From 0e459aae9293d0208cc9fc80ba387a36657d7b66 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Thu, 23 Apr 2026 23:53:11 -0400
Subject: [PATCH] thalamus/supervisor: reap channel daemons via SIGCHLD instead of SIG_IGN
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SIGCHLD=SIG_IGN at main() was auto-reaping all children in the kernel,
which broke tokio::process::Command::wait() — every tool that spawned
a subprocess (bash, mcp clients) was getting ECHILD because tokio
couldn't waitpid() on a child the kernel had already reaped.

Replace with a SIGCHLD signal handler task that reaps only PIDs listed
in channels_dir() (via waitpid(pid, WNOHANG) — ECHILD on non-child is
a harmless no-op). Tokio-spawned children aren't in PID files, so
tokio's own per-child wait paths are untouched.

Co-Authored-By: Proof of Concept
---
 src/thalamus/supervisor.rs | 45 ++++++++++++++++++++++++++++++++++++++
 src/user/mod.rs            |  6 +++--
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/src/thalamus/supervisor.rs b/src/thalamus/supervisor.rs
index a4c53ec..3716682 100644
--- a/src/thalamus/supervisor.rs
+++ b/src/thalamus/supervisor.rs
@@ -19,6 +19,51 @@ fn channels_dir() -> PathBuf {
         .join(".consciousness/channels")
 }
 
+/// Install a SIGCHLD-driven reaper for channel daemons.
+///
+/// We can't use SIGCHLD=SIG_IGN because that makes the kernel auto-reap
+/// all children, and tokio::process::Command::wait() then returns ECHILD
+/// (breaking every tool that spawns a subprocess — bash, mcp clients, etc.).
+///
+/// Instead, on each SIGCHLD we read PID files in channels_dir() and call
+/// waitpid(pid, WNOHANG) on each. That reaps only our own zombie channel
+/// daemons; waitpid on any other PID returns ECHILD (harmless no-op).
+/// Tokio-spawned children aren't recorded in PID files, so tokio's own
+/// per-child wait paths are left free to reap them.
+pub fn start_zombie_reaper() -> tokio::task::JoinHandle<()> {
+    use tokio::signal::unix::{signal, SignalKind};
+    tokio::spawn(async move {
+        let mut sig = match signal(SignalKind::child()) {
+            Ok(s) => s,
+            Err(e) => {
+                error!("failed to install SIGCHLD handler: {}", e);
+                return;
+            }
+        };
+        while sig.recv().await.is_some() {
+            reap_channel_daemons();
+        }
+    })
+}
+
+/// Scan `channels_dir()` for `*.pid` files and non-blockingly reap each listed child.
+fn reap_channel_daemons() {
+    let Ok(entries) = std::fs::read_dir(channels_dir()) else { return };
+    for entry in entries.flatten() {
+        let path = entry.path();
+        if path.extension().and_then(|s| s.to_str()) != Some("pid") {
+            continue;
+        }
+        let Ok(s) = std::fs::read_to_string(&path) else { continue };
+        // Only positive PIDs: waitpid(0 / -1 / -pgid) would reap *any* child,
+        // stealing exit statuses from tokio-spawned processes (ECHILD again).
+        let Some(pid) = s.trim().parse::<libc::pid_t>().ok().filter(|&p| p > 0) else { continue };
+        let mut status = 0;
+        // SAFETY: `status` is a live, writable c_int for the duration of the call.
+        unsafe { libc::waitpid(pid, &mut status, libc::WNOHANG); }
+    }
+}
+
 fn config_path() -> PathBuf {
     channels_dir().join("channels.json5")
 }
diff --git a/src/user/mod.rs b/src/user/mod.rs
index fc3a4ac..04e895b 100644
--- a/src/user/mod.rs
+++ b/src/user/mod.rs
@@ -756,8 +756,10 @@ fn restore_stderr(original_fd: std::os::fd::RawFd) {
 
 #[tokio::main]
 pub async fn main() {
-    // Auto-reap child processes (channel daemons outlive the supervisor)
-    unsafe { libc::signal(libc::SIGCHLD, libc::SIG_IGN); }
+    // Reap channel-daemon zombies via a SIGCHLD handler that only touches
+    // PIDs listed in channels_dir(). Avoids SIGCHLD=SIG_IGN, which would
+    // break tokio::process::Command::wait() (kernel auto-reap → ECHILD).
+    let _reaper = crate::thalamus::supervisor::start_zombie_reaper();
 
     // Redirect stderr to pipe — logs to file and sends to channel for UI display
     let stderr_capture = redirect_stderr_to_pipe();