thalamus/supervisor: reap channel daemons via SIGCHLD instead of SIG_IGN
SIGCHLD=SIG_IGN at main() was auto-reaping all children in the kernel, which broke tokio::process::Command::wait() — every tool that spawned a subprocess (bash, mcp clients) was getting ECHILD because tokio couldn't waitpid() on a child the kernel had already reaped. Replace with a SIGCHLD signal handler task that reaps only PIDs listed in channels_dir() (via waitpid(pid, WNOHANG) — ECHILD on non-child is a harmless no-op). Tokio-spawned children aren't in PID files, so tokio's own per-child wait paths are untouched. Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
d95f3e9445
commit
0e459aae92
2 changed files with 49 additions and 2 deletions
|
|
@ -19,6 +19,51 @@ fn channels_dir() -> PathBuf {
|
||||||
.join(".consciousness/channels")
|
.join(".consciousness/channels")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Install a SIGCHLD-driven reaper for channel daemons.
|
||||||
|
///
|
||||||
|
/// We can't use SIGCHLD=SIG_IGN because that makes the kernel auto-reap
|
||||||
|
/// all children, and tokio::process::Command::wait() then returns ECHILD
|
||||||
|
/// (breaking every tool that spawns a subprocess — bash, mcp clients, etc.).
|
||||||
|
///
|
||||||
|
/// Instead, on each SIGCHLD we read PID files in channels_dir() and call
|
||||||
|
/// waitpid(pid, WNOHANG) on each. That reaps only our own zombie channel
|
||||||
|
/// daemons; waitpid on any other PID returns ECHILD (harmless no-op).
|
||||||
|
/// Tokio-spawned children aren't recorded in PID files, so tokio's own
|
||||||
|
/// per-child wait paths are left free to reap them.
|
||||||
|
pub fn start_zombie_reaper() -> tokio::task::JoinHandle<()> {
|
||||||
|
use tokio::signal::unix::{signal, SignalKind};
|
||||||
|
tokio::spawn(async move {
|
||||||
|
let mut sig = match signal(SignalKind::child()) {
|
||||||
|
Ok(s) => s,
|
||||||
|
Err(e) => {
|
||||||
|
error!("failed to install SIGCHLD handler: {}", e);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
while sig.recv().await.is_some() {
|
||||||
|
reap_channel_daemons();
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn reap_channel_daemons() {
|
||||||
|
let entries = match std::fs::read_dir(channels_dir()) {
|
||||||
|
Ok(e) => e,
|
||||||
|
Err(_) => return,
|
||||||
|
};
|
||||||
|
for entry in entries.flatten() {
|
||||||
|
let path = entry.path();
|
||||||
|
if path.extension().and_then(|s| s.to_str()) != Some("pid") {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let Ok(s) = std::fs::read_to_string(&path) else { continue };
|
||||||
|
let Ok(pid) = s.trim().parse::<i32>() else { continue };
|
||||||
|
let mut status = 0;
|
||||||
|
// Reaps our zombie child; ECHILD on non-child is a no-op.
|
||||||
|
unsafe { libc::waitpid(pid, &mut status, libc::WNOHANG); }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Path of the `channels.json5` file inside channels_dir().
fn config_path() -> PathBuf {
|
fn config_path() -> PathBuf {
|
||||||
channels_dir().join("channels.json5")
|
channels_dir().join("channels.json5")
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -756,8 +756,10 @@ fn restore_stderr(original_fd: std::os::fd::RawFd) {
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
pub async fn main() {
|
pub async fn main() {
|
||||||
// Auto-reap child processes (channel daemons outlive the supervisor)
|
// Reap channel-daemon zombies via a SIGCHLD handler that only touches
|
||||||
unsafe { libc::signal(libc::SIGCHLD, libc::SIG_IGN); }
|
// PIDs listed in channels_dir(). Avoids SIGCHLD=SIG_IGN, which would
|
||||||
|
// break tokio::process::Command::wait() (kernel auto-reap → ECHILD).
|
||||||
|
let _reaper = crate::thalamus::supervisor::start_zombie_reaper();
|
||||||
|
|
||||||
// Redirect stderr to pipe — logs to file and sends to channel for UI display
|
// Redirect stderr to pipe — logs to file and sends to channel for UI display
|
||||||
let stderr_capture = redirect_stderr_to_pipe();
|
let stderr_capture = redirect_stderr_to_pipe();
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue