thalamus/supervisor: reap channel daemons via SIGCHLD instead of SIG_IGN

SIGCHLD=SIG_IGN at main() was auto-reaping all children in the kernel,
which broke tokio::process::Command::wait() — every tool that spawned a
subprocess (bash, mcp clients) was getting ECHILD because tokio couldn't
waitpid() on a child the kernel had already reaped.

Replace with a SIGCHLD signal handler task that reaps only PIDs listed in
channels_dir() (via waitpid(pid, WNOHANG) — ECHILD on non-child is a
harmless no-op). Tokio-spawned children aren't in PID files, so tokio's
own per-child wait paths are untouched.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-23 23:53:11 -04:00
parent d95f3e9445
commit 0e459aae92
2 changed files with 49 additions and 2 deletions

View file

@ -19,6 +19,51 @@ fn channels_dir() -> PathBuf {
.join(".consciousness/channels")
}
/// Install a SIGCHLD-driven reaper for channel daemons.
///
/// We can't use SIGCHLD=SIG_IGN because that makes the kernel auto-reap
/// all children, and tokio::process::Command::wait() then returns ECHILD
/// (breaking every tool that spawns a subprocess — bash, mcp clients, etc.).
///
/// Instead, the spawned task calls `reap_channel_daemons()` once as soon as
/// the SIGCHLD stream is installed, and then again on every SIGCHLD it
/// receives. The reaping policy itself (which PIDs are waited on) lives in
/// `reap_channel_daemons()`.
///
/// Returns the handle of the spawned task. The task ends early (after logging
/// an error) if the SIGCHLD stream cannot be installed, and otherwise runs
/// until the signal stream reports that no further events will arrive.
pub fn start_zombie_reaper() -> tokio::task::JoinHandle<()> {
    use tokio::signal::unix::{signal, SignalKind};

    tokio::spawn(async move {
        let mut sigchld = match signal(SignalKind::child()) {
            Ok(stream) => stream,
            Err(e) => {
                error!("failed to install SIGCHLD handler: {}", e);
                return;
            }
        };

        // A daemon that exited before the stream above was registered never
        // produces an event on it, so sweep once up front instead of leaving
        // that zombie around until some unrelated child happens to exit.
        reap_channel_daemons();

        while sigchld.recv().await.is_some() {
            reap_channel_daemons();
        }
    })
}
/// Reap exited channel daemons whose PIDs are recorded in `*.pid` files
/// directly under `channels_dir()`.
///
/// Best-effort: an unreadable directory, unreadable file, or file whose
/// trimmed contents do not parse as a positive integer is silently skipped.
/// Each remaining PID is passed to a non-blocking `waitpid(pid, WNOHANG)`;
/// the result is deliberately ignored, since a PID that is not our child
/// simply fails with ECHILD.
fn reap_channel_daemons() {
    let Ok(entries) = std::fs::read_dir(channels_dir()) else {
        return;
    };

    for entry in entries.flatten() {
        let path = entry.path();
        if path.extension().and_then(|ext| ext.to_str()) != Some("pid") {
            continue;
        }

        let Ok(contents) = std::fs::read_to_string(&path) else {
            continue;
        };
        let Ok(pid) = contents.trim().parse::<i32>() else {
            continue;
        };

        // waitpid() gives non-positive PIDs a special meaning: -1 is "any
        // child", 0 is "any child in our process group", and < -1 is "any
        // child in process group |pid|". A corrupt PID file holding such a
        // value would make us reap children that tokio is waiting on, which
        // is exactly the ECHILD breakage this reaper exists to avoid.
        if pid <= 0 {
            continue;
        }

        let mut status = 0;
        // SAFETY: `status` is a valid, writable c_int for the duration of the
        // call, and WNOHANG guarantees waitpid() does not block.
        // Reaps our zombie child; ECHILD on non-child is a no-op.
        unsafe {
            libc::waitpid(pid, &mut status, libc::WNOHANG);
        }
    }
}
/// Location of the channels configuration file: `channels.json5` inside
/// `channels_dir()`.
fn config_path() -> PathBuf {
    let mut path = channels_dir();
    path.push("channels.json5");
    path
}

View file

@ -756,8 +756,10 @@ fn restore_stderr(original_fd: std::os::fd::RawFd) {
#[tokio::main]
pub async fn main() {
// Auto-reap child processes (channel daemons outlive the supervisor)
unsafe { libc::signal(libc::SIGCHLD, libc::SIG_IGN); }
// Reap channel-daemon zombies via a SIGCHLD handler that only touches
// PIDs listed in channels_dir(). Avoids SIGCHLD=SIG_IGN, which would
// break tokio::process::Command::wait() (kernel auto-reap → ECHILD).
let _reaper = crate::thalamus::supervisor::start_zombie_reaper();
// Redirect stderr to pipe — logs to file and sends to channel for UI display
let stderr_capture = redirect_stderr_to_pipe();