thalamus/supervisor: reap channel daemons via SIGCHLD instead of SIG_IGN
SIGCHLD=SIG_IGN at main() was auto-reaping all children in the kernel, which broke tokio::process::Command::wait() — every tool that spawned a subprocess (bash, mcp clients) was getting ECHILD because tokio couldn't waitpid() on a child the kernel had already reaped. Replace with a SIGCHLD signal handler task that reaps only PIDs listed in channels_dir() (via waitpid(pid, WNOHANG) — ECHILD on non-child is a harmless no-op). Tokio-spawned children aren't in PID files, so tokio's own per-child wait paths are untouched. Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
d95f3e9445
commit
0e459aae92
2 changed files with 49 additions and 2 deletions
|
|
@ -19,6 +19,51 @@ fn channels_dir() -> PathBuf {
|
|||
.join(".consciousness/channels")
|
||||
}
|
||||
|
||||
/// Install a SIGCHLD-driven reaper for channel daemons.
|
||||
///
|
||||
/// We can't use SIGCHLD=SIG_IGN because that makes the kernel auto-reap
|
||||
/// all children, and tokio::process::Command::wait() then returns ECHILD
|
||||
/// (breaking every tool that spawns a subprocess — bash, mcp clients, etc.).
|
||||
///
|
||||
/// Instead, on each SIGCHLD we read PID files in channels_dir() and call
|
||||
/// waitpid(pid, WNOHANG) on each. That reaps only our own zombie channel
|
||||
/// daemons; waitpid on any other PID returns ECHILD (harmless no-op).
|
||||
/// Tokio-spawned children aren't recorded in PID files, so tokio's own
|
||||
/// per-child wait paths are left free to reap them.
|
||||
pub fn start_zombie_reaper() -> tokio::task::JoinHandle<()> {
|
||||
use tokio::signal::unix::{signal, SignalKind};
|
||||
tokio::spawn(async move {
|
||||
let mut sig = match signal(SignalKind::child()) {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
error!("failed to install SIGCHLD handler: {}", e);
|
||||
return;
|
||||
}
|
||||
};
|
||||
while sig.recv().await.is_some() {
|
||||
reap_channel_daemons();
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn reap_channel_daemons() {
|
||||
let entries = match std::fs::read_dir(channels_dir()) {
|
||||
Ok(e) => e,
|
||||
Err(_) => return,
|
||||
};
|
||||
for entry in entries.flatten() {
|
||||
let path = entry.path();
|
||||
if path.extension().and_then(|s| s.to_str()) != Some("pid") {
|
||||
continue;
|
||||
}
|
||||
let Ok(s) = std::fs::read_to_string(&path) else { continue };
|
||||
let Ok(pid) = s.trim().parse::<i32>() else { continue };
|
||||
let mut status = 0;
|
||||
// Reaps our zombie child; ECHILD on non-child is a no-op.
|
||||
unsafe { libc::waitpid(pid, &mut status, libc::WNOHANG); }
|
||||
}
|
||||
}
|
||||
|
||||
fn config_path() -> PathBuf {
|
||||
channels_dir().join("channels.json5")
|
||||
}
|
||||
|
|
|
|||
|
|
@ -756,8 +756,10 @@ fn restore_stderr(original_fd: std::os::fd::RawFd) {
|
|||
|
||||
#[tokio::main]
|
||||
pub async fn main() {
|
||||
// Auto-reap child processes (channel daemons outlive the supervisor)
|
||||
unsafe { libc::signal(libc::SIGCHLD, libc::SIG_IGN); }
|
||||
// Reap channel-daemon zombies via a SIGCHLD handler that only touches
|
||||
// PIDs listed in channels_dir(). Avoids SIGCHLD=SIG_IGN, which would
|
||||
// break tokio::process::Command::wait() (kernel auto-reap → ECHILD).
|
||||
let _reaper = crate::thalamus::supervisor::start_zombie_reaper();
|
||||
|
||||
// Redirect stderr to pipe — logs to file and sends to channel for UI display
|
||||
let stderr_capture = redirect_stderr_to_pipe();
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue