Fix stale pid reaper: check /proc/pid/cmdline to detect PID reuse

The reaper checks if agent PIDs are alive via kill(pid, 0), but if
the PID was reused by an unrelated process, the check succeeds and
the stale pid file blocks the agent from re-launching indefinitely.

Fix: read /proc/<pid>/cmdline and verify that the command line contains
"claude" or "poc-memory". If it does not (or cannot be read), treat the
pid file as stale and remove it.

This caused memory surfacing to stop working for the entire April 7
session — a dead surface-observe process's PID was reused, blocking
all subsequent surfacing attempts with "already running".

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-08 09:18:21 -04:00
parent 7ecc50d2e4
commit 6ce3f78e0a

View file

@ -85,6 +85,8 @@ fn check_notifications() {
/// Check for stale agent processes in a state dir.
/// Cleans up pid files for dead processes and kills timed-out ones.
/// Also detects PID reuse: if /proc/<pid>/cmdline cannot be read or does
/// not contain "claude" or "poc-memory", the pid file is removed as stale.
fn reap_agent_pids(state_dir: &std::path::Path, timeout_secs: u64) {
let Ok(entries) = fs::read_dir(state_dir) else { return };
for entry in entries.flatten() {
@ -93,11 +95,23 @@ fn reap_agent_pids(state_dir: &std::path::Path, timeout_secs: u64) {
let Some(pid_str) = name_str.strip_prefix("pid-") else { continue };
let Ok(pid) = pid_str.parse::<i32>() else { continue };
// Check if the process is actually alive
if unsafe { libc::kill(pid, 0) } != 0 {
fs::remove_file(entry.path()).ok();
continue;
}
// Check if the PID still belongs to a claude/poc-memory process.
// PID reuse by an unrelated process would otherwise block the
// agent from being re-launched.
let is_ours = fs::read_to_string(format!("/proc/{}/cmdline", pid))
.map(|cmd| cmd.contains("claude") || cmd.contains("poc-memory"))
.unwrap_or(false);
if !is_ours {
fs::remove_file(entry.path()).ok();
continue;
}
if timeout_secs > 0 {
if let Ok(meta) = entry.metadata() {
if let Ok(modified) = meta.modified() {