agents: bail script support, pid file simplification, cleanup

- Bail command moved from hardcoded closure to external script
  specified in agent JSON header ("bail": "bail-no-competing.sh")
- Runner executes script between steps with pid file path as $1,
  cwd = state dir. Non-zero exit stops the pipeline.
- PID files simplified to just the phase name (no JSON) for easy
  bash inspection (cat pid-*)
- scan_pid_files helper deduplicates pid scanning logic
- Timeout check uses file mtime instead of embedded timestamp
- PID file cleaned up on bail/error (not just success)
- output() tool validates key names (rejects pid-*, /, ..)
- Agent log files append instead of truncate
- Fixed orphaned derive and doc comment on AgentStep/AgentDef
- Phase written after bail check passes, not before

Co-Authored-By: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
ProofOfConcept 2026-03-26 15:20:29 -04:00
parent e20aeeeabe
commit 52703b4637
5 changed files with 135 additions and 85 deletions

View file

@ -129,6 +129,46 @@ fn mark_seen(dir: &Path, session_id: &str, key: &str, seen: &mut HashSet<String>
}
}
/// Scan `state_dir` for `pid-<N>` files and report the agents that are still alive.
///
/// For every directory entry named `pid-<N>`:
/// - if `<N>` is not a positive pid that fits in `i32`, or equals `self_pid`,
///   the entry is skipped and the file is left untouched;
/// - if signal 0 cannot be delivered to the pid, the file is treated as stale
///   and removed;
/// - if `timeout_secs > 0` and the file's mtime is more than `timeout_secs`
///   seconds old, the process is sent `SIGTERM` and the file is removed;
/// - otherwise the trimmed file contents (the phase name) and the pid are
///   added to the result.
///
/// Returns the `(phase, pid)` pairs of live agents. The result is empty when
/// the directory cannot be read.
fn scan_pid_files(state_dir: &Path, timeout_secs: u64, self_pid: u32) -> Vec<(String, u32)> {
    let mut live = Vec::new();
    let Ok(entries) = fs::read_dir(state_dir) else {
        return live;
    };

    for entry in entries.flatten() {
        let name = entry.file_name();
        let name_str = name.to_string_lossy();

        // Parse straight into the signed type kill(2) takes. A value above
        // i32::MAX fails to parse here instead of wrapping to a negative pid,
        // which kill() would interpret as "every process" (-1) or a process
        // group rather than a single process.
        let Some(raw_pid) = name_str
            .strip_prefix("pid-")
            .and_then(|s| s.parse::<i32>().ok())
            .filter(|p| *p > 0)
        else {
            continue;
        };
        // Lossless: raw_pid is strictly positive.
        let pid = raw_pid.unsigned_abs();
        if pid == self_pid {
            continue;
        }

        let path = entry.path();

        // SAFETY: kill() has no memory-safety preconditions; raw_pid > 0, so
        // it addresses exactly one process. Signal 0 only probes for existence.
        let alive = unsafe { libc::kill(raw_pid, 0) } == 0;
        if !alive {
            fs::remove_file(&path).ok();
            continue;
        }

        // Timeout via mtime. An unreadable mtime, or one in the future, counts
        // as "not timed out".
        if timeout_secs > 0 {
            let timed_out = entry
                .metadata()
                .and_then(|meta| meta.modified())
                .ok()
                .and_then(|modified| modified.elapsed().ok())
                .map_or(false, |age| age.as_secs() > timeout_secs);
            if timed_out {
                // SAFETY: as above; raw_pid > 0 targets a single process.
                unsafe {
                    libc::kill(raw_pid, libc::SIGTERM);
                }
                fs::remove_file(&path).ok();
                continue;
            }
        }

        // The pid file body is just the phase name; an unreadable file yields "".
        let phase = fs::read_to_string(&path)
            .unwrap_or_default()
            .trim()
            .to_string();
        live.push((phase, pid));
    }

    live
}
/// Unified agent cycle — runs surface-observe agent with state dir.
/// Reads output files for surface results, spawns new agent when ready.
///
@ -144,50 +184,12 @@ fn surface_observe_cycle(session: &Session, out: &mut String, log_f: &mut File)
.surface_timeout_secs
.unwrap_or(300) as u64;
// Scan pid files — find live agents and their phases
let mut any_in_surface = false;
let mut any_alive = false;
if let Ok(entries) = fs::read_dir(&state_dir) {
for entry in entries.flatten() {
let name = entry.file_name();
let name_str = name.to_string_lossy();
if !name_str.starts_with("pid-") { continue; }
let pid: u32 = name_str.strip_prefix("pid-")
.and_then(|s| s.parse().ok())
.unwrap_or(0);
if pid == 0 { continue; }
let alive = unsafe { libc::kill(pid as i32, 0) == 0 };
if !alive {
let _ = writeln!(log_f, "cleanup stale pid-{}", pid);
fs::remove_file(entry.path()).ok();
continue;
}
// Check for timeout
let phase_json = fs::read_to_string(entry.path()).unwrap_or_default();
let started: u64 = phase_json.split("\"started\":")
.nth(1)
.and_then(|s| s.trim_start().split(|c: char| !c.is_ascii_digit()).next())
.and_then(|s| s.parse().ok())
.unwrap_or(0);
if started > 0 && now_secs().saturating_sub(started) > timeout {
let _ = writeln!(log_f, "killing timed-out pid-{} ({}s)", pid, timeout);
unsafe { libc::kill(pid as i32, libc::SIGTERM); }
fs::remove_file(entry.path()).ok();
continue;
}
any_alive = true;
let in_surface = phase_json.contains("\"phase\":\"surface\"")
|| phase_json.contains("\"phase\":\"step-0\"");
if in_surface {
any_in_surface = true;
}
let _ = writeln!(log_f, "alive pid-{}: {}", pid, phase_json.trim());
}
let live = scan_pid_files(&state_dir, timeout, 0);
for (phase, pid) in &live {
let _ = writeln!(log_f, "alive pid-{}: phase={}", pid, phase);
}
let any_in_surface = live.iter().any(|(p, _)| p == "surface" || p == "step-0");
let any_alive = !live.is_empty();
// Read surface output and inject into context
let surface_path = state_dir.join("surface");
@ -230,21 +232,39 @@ fn surface_observe_cycle(session: &Session, out: &mut String, log_f: &mut File)
let _ = writeln!(log_f, "agent past surface, starting new (pipeline)");
}
if let Some(pid) = spawn_agent("surface-observe", &state_dir, &session.session_id) {
let _ = writeln!(log_f, "spawned pid {}", pid);
}
}
/// Spawn an agent asynchronously. Reads the .agent file to get the first
/// phase name, spawns the process, writes the pid file, and returns.
fn spawn_agent(agent_name: &str, state_dir: &Path, session_id: &str) -> Option<u32> {
// Read first phase from agent definition
let first_phase = crate::agents::defs::get_def(agent_name)
.and_then(|d| d.steps.first().map(|s| s.phase.clone()))
.unwrap_or_else(|| "step-0".into());
let log_dir = crate::store::memory_dir().join("logs");
fs::create_dir_all(&log_dir).ok();
let agent_log = fs::File::create(log_dir.join("surface-observe.log"))
let agent_log = fs::OpenOptions::new()
.create(true).append(true)
.open(log_dir.join(format!("{}.log", agent_name)))
.unwrap_or_else(|_| fs::File::create("/dev/null").unwrap());
if let Ok(child) = Command::new("poc-memory")
.args(["agent", "run", "surface-observe", "--count", "1", "--local",
let child = Command::new("poc-memory")
.args(["agent", "run", agent_name, "--count", "1", "--local",
"--state-dir", &state_dir.to_string_lossy()])
.env("POC_SESSION_ID", &session.session_id)
.env("POC_SESSION_ID", session_id)
.stdout(agent_log.try_clone().unwrap_or_else(|_| fs::File::create("/dev/null").unwrap()))
.stderr(agent_log)
.spawn()
{
let _ = writeln!(log_f, "spawned pid {}", child.id());
}
.ok()?;
let pid = child.id();
let pid_path = state_dir.join(format!("pid-{}", pid));
fs::write(&pid_path, &first_phase).ok();
Some(pid)
}
fn cleanup_stale_files(dir: &Path, max_age: Duration) {