From 52703b46379d7a37271d6bca4fdb4db2d5a8ff73 Mon Sep 17 00:00:00 2001 From: ProofOfConcept Date: Thu, 26 Mar 2026 15:20:29 -0400 Subject: [PATCH] agents: bail script support, pid file simplification, cleanup - Bail command moved from hardcoded closure to external script specified in agent JSON header ("bail": "bail-no-competing.sh") - Runner executes script between steps with pid file path as $1, cwd = state dir. Non-zero exit stops the pipeline. - PID files simplified to just the phase name (no JSON) for easy bash inspection (cat pid-*) - scan_pid_files helper deduplicates pid scanning logic - Timeout check uses file mtime instead of embedded timestamp - PID file cleaned up on bail/error (not just success) - output() tool validates key names (rejects pid-*, /, ..) - Agent log files append instead of truncate - Fixed orphaned derive and doc comment on AgentStep/AgentDef - Phase written after bail check passes, not before Co-Authored-By: Kent Overstreet --- src/agent/tools/memory.rs | 3 + src/hippocampus/memory_search.rs | 120 +++++++++++-------- src/subconscious/agents/bail-no-competing.sh | 22 ++++ src/subconscious/defs.rs | 13 +- src/subconscious/knowledge.rs | 62 +++++----- 5 files changed, 135 insertions(+), 85 deletions(-) create mode 100755 src/subconscious/agents/bail-no-competing.sh diff --git a/src/agent/tools/memory.rs b/src/agent/tools/memory.rs index f805409..23626ce 100644 --- a/src/agent/tools/memory.rs +++ b/src/agent/tools/memory.rs @@ -139,6 +139,9 @@ pub fn dispatch(name: &str, args: &serde_json::Value, provenance: Option<&str>) } "output" => { let key = get_str(args, "key")?; + if key.starts_with("pid-") || key.contains('/') || key.contains("..") { + anyhow::bail!("invalid output key: {}", key); + } let value = get_str(args, "value")?; let dir = std::env::var("POC_AGENT_OUTPUT_DIR") .map_err(|_| anyhow::anyhow!("no output directory set"))?; diff --git a/src/hippocampus/memory_search.rs b/src/hippocampus/memory_search.rs index 0363e4a..039c57e 100644 --- a/src/hippocampus/memory_search.rs +++ b/src/hippocampus/memory_search.rs @@ -129,6 +129,46 @@ fn mark_seen(dir: &Path, session_id: &str, key: &str, seen: &mut HashSet } } +/// Check for live agent processes in a state dir. Returns (phase, pid) pairs. +/// Cleans up stale pid files and kills timed-out processes. +fn scan_pid_files(state_dir: &Path, timeout_secs: u64, self_pid: u32) -> Vec<(String, u32)> { + let mut live = Vec::new(); + let Ok(entries) = fs::read_dir(state_dir) else { return live }; + for entry in entries.flatten() { + let name = entry.file_name(); + let name_str = name.to_string_lossy(); + if !name_str.starts_with("pid-") { continue; } + let pid: u32 = name_str.strip_prefix("pid-") + .and_then(|s| s.parse().ok()) + .unwrap_or(0); + if pid == 0 || pid == self_pid { continue; } + + if unsafe { libc::kill(pid as i32, 0) } != 0 { + fs::remove_file(entry.path()).ok(); + continue; + } + + // Timeout via mtime + if timeout_secs > 0 { + if let Ok(meta) = entry.metadata() { + if let Ok(modified) = meta.modified() { + if modified.elapsed().unwrap_or_default().as_secs() > timeout_secs { + unsafe { libc::kill(pid as i32, libc::SIGTERM); } + fs::remove_file(entry.path()).ok(); + continue; + } + } + } + } + + let phase = fs::read_to_string(entry.path()) + .unwrap_or_default() + .trim().to_string(); + live.push((phase, pid)); + } + live +} + /// Unified agent cycle — runs surface-observe agent with state dir. /// Reads output files for surface results, spawns new agent when ready. /// @@ -144,50 +184,12 @@ fn surface_observe_cycle(session: &Session, out: &mut String, log_f: &mut File) .surface_timeout_secs .unwrap_or(300) as u64; - // Scan pid files — find live agents and their phases - let mut any_in_surface = false; - let mut any_alive = false; - if let Ok(entries) = fs::read_dir(&state_dir) { - for entry in entries.flatten() { - let name = entry.file_name(); - let name_str = name.to_string_lossy(); - if !name_str.starts_with("pid-") { continue; } - let pid: u32 = name_str.strip_prefix("pid-") - .and_then(|s| s.parse().ok()) - .unwrap_or(0); - if pid == 0 { continue; } - - let alive = unsafe { libc::kill(pid as i32, 0) == 0 }; - if !alive { - let _ = writeln!(log_f, "cleanup stale pid-{}", pid); - fs::remove_file(entry.path()).ok(); - continue; - } - - // Check for timeout - let phase_json = fs::read_to_string(entry.path()).unwrap_or_default(); - let started: u64 = phase_json.split("\"started\":") - .nth(1) - .and_then(|s| s.trim_start().split(|c: char| !c.is_ascii_digit()).next()) - .and_then(|s| s.parse().ok()) - .unwrap_or(0); - if started > 0 && now_secs().saturating_sub(started) > timeout { - let _ = writeln!(log_f, "killing timed-out pid-{} ({}s)", pid, timeout); - unsafe { libc::kill(pid as i32, libc::SIGTERM); } - fs::remove_file(entry.path()).ok(); - continue; - } - - any_alive = true; - - let in_surface = phase_json.contains("\"phase\":\"surface\"") - || phase_json.contains("\"phase\":\"step-0\""); - if in_surface { - any_in_surface = true; - } - let _ = writeln!(log_f, "alive pid-{}: {}", pid, phase_json.trim()); - } + let live = scan_pid_files(&state_dir, timeout, 0); + for (phase, pid) in &live { + let _ = writeln!(log_f, "alive pid-{}: phase={}", pid, phase); } + let any_in_surface = live.iter().any(|(p, _)| p == "surface" || p == "step-0"); + let any_alive = !live.is_empty(); // Read surface output and inject into context let surface_path = state_dir.join("surface"); @@ -230,21 +232,39 @@ fn surface_observe_cycle(session: &Session, out: &mut String, log_f: &mut File) let _ = writeln!(log_f, "agent past surface, starting new (pipeline)"); } + if let Some(pid) = spawn_agent("surface-observe", &state_dir, &session.session_id) { + let _ = writeln!(log_f, "spawned pid {}", pid); + } +} + +/// Spawn an agent asynchronously. Reads the .agent file to get the first +/// phase name, spawns the process, writes the pid file, and returns. +fn spawn_agent(agent_name: &str, state_dir: &Path, session_id: &str) -> Option { + // Read first phase from agent definition + let first_phase = crate::agents::defs::get_def(agent_name) + .and_then(|d| d.steps.first().map(|s| s.phase.clone())) + .unwrap_or_else(|| "step-0".into()); + let log_dir = crate::store::memory_dir().join("logs"); fs::create_dir_all(&log_dir).ok(); - let agent_log = fs::File::create(log_dir.join("surface-observe.log")) + let agent_log = fs::OpenOptions::new() + .create(true).append(true) + .open(log_dir.join(format!("{}.log", agent_name))) .unwrap_or_else(|_| fs::File::create("/dev/null").unwrap()); - if let Ok(child) = Command::new("poc-memory") - .args(["agent", "run", "surface-observe", "--count", "1", "--local", + let child = Command::new("poc-memory") + .args(["agent", "run", agent_name, "--count", "1", "--local", "--state-dir", &state_dir.to_string_lossy()]) - .env("POC_SESSION_ID", &session.session_id) + .env("POC_SESSION_ID", session_id) .stdout(agent_log.try_clone().unwrap_or_else(|_| fs::File::create("/dev/null").unwrap())) .stderr(agent_log) .spawn() - { - let _ = writeln!(log_f, "spawned pid {}", child.id()); - } + .ok()?; + + let pid = child.id(); + let pid_path = state_dir.join(format!("pid-{}", pid)); + fs::write(&pid_path, &first_phase).ok(); + Some(pid) } fn cleanup_stale_files(dir: &Path, max_age: Duration) { diff --git a/src/subconscious/agents/bail-no-competing.sh b/src/subconscious/agents/bail-no-competing.sh new file mode 100755 index 0000000..d030060 --- /dev/null +++ b/src/subconscious/agents/bail-no-competing.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Bail if other agents are alive in the state dir. +# $1 = path to this agent's pid file +# cwd = state dir +# +# Exit 0 = continue, exit 1 = bail + +my_pid_file=$(basename "$1") + +for f in pid-*; do + [ "$f" = "$my_pid_file" ] && continue + [ ! -f "$f" ] && continue + + pid=${f#pid-} + if kill -0 "$pid" 2>/dev/null; then + exit 1 # another agent is alive, bail + else + rm -f "$f" # stale, clean up + fi +done + +exit 0 diff --git a/src/subconscious/defs.rs b/src/subconscious/defs.rs index fe4dd39..17eee6a 100644 --- a/src/subconscious/defs.rs +++ b/src/subconscious/defs.rs @@ -24,8 +24,6 @@ use serde::Deserialize; use std::path::PathBuf; -/// Agent definition: config (from JSON header) + prompt (raw markdown body). -#[derive(Clone, Debug)] /// A single step in a multi-step agent. pub struct AgentStep { pub prompt: String, @@ -34,6 +32,7 @@ pub struct AgentStep { pub phase: String, } +/// Agent definition: config (from JSON header) + prompt steps. pub struct AgentDef { pub agent: String, pub query: String, @@ -47,6 +46,9 @@ pub struct AgentDef { pub chunk_size: Option, pub chunk_overlap: Option, pub temperature: Option, + /// Bail check command — run between steps with pid file path as $1, + /// cwd = state dir. Non-zero exit = stop the pipeline. + pub bail: Option, } /// The JSON header portion (first line of the file). @@ -73,6 +75,10 @@ struct AgentHeader { /// LLM temperature override #[serde(default)] temperature: Option, + /// Bail check command — run between steps with pid file path as $1, + /// cwd = state dir. Non-zero exit = stop the pipeline. + #[serde(default)] + bail: Option, } fn default_model() -> String { "sonnet".into() } @@ -143,10 +149,11 @@ fn parse_agent_file(content: &str) -> Option { chunk_size: header.chunk_size, chunk_overlap: header.chunk_overlap, temperature: header.temperature, + bail: header.bail, }) } -fn agents_dir() -> PathBuf { +pub fn agents_dir() -> PathBuf { let repo = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src/subconscious/agents"); if repo.is_dir() { return repo; } crate::store::memory_dir().join("agents") diff --git a/src/subconscious/knowledge.rs b/src/subconscious/knowledge.rs index 5945ce6..b18d42d 100644 --- a/src/subconscious/knowledge.rs +++ b/src/subconscious/knowledge.rs @@ -174,15 +174,11 @@ fn run_one_agent_inner( // Safe: agent runs single-threaded, env var read only by our dispatch code unsafe { std::env::set_var("POC_AGENT_OUTPUT_DIR", &output_dir); } - // Write PID file with initial phase + // Write PID file — content is just the phase name let pid = std::process::id(); let pid_path = output_dir.join(format!("pid-{}", pid)); let write_pid = |phase: &str| { - let json = format!("{{\"phase\":\"{}\",\"started\":{}}}", phase, - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap().as_secs()); - fs::write(&pid_path, &json).ok(); + fs::write(&pid_path, phase).ok(); }; write_pid(&agent_batch.steps[0].phase); @@ -201,42 +197,44 @@ fn run_one_agent_inner( } log("\n=== CALLING LLM ==="); - // Bail check: between steps, check for other pid files in the state dir. - // If another agent has started, bail — let it have the resources. - let output_dir_clone = output_dir.clone(); + // Bail check: if the agent defines a bail script, run it between steps. + // The script receives the pid file path as $1, cwd = state dir. + let bail_script = def.bail.as_ref().map(|name| { + // Look for the script next to the .agent file + let agents_dir = super::defs::agents_dir(); + agents_dir.join(name) + }); + let output_dir_for_bail = output_dir.clone(); + let pid_path_for_bail = pid_path.clone(); let bail_fn = move |step_idx: usize| -> Result<(), String> { + // Update phase if step_idx < step_phases.len() { write_pid(&step_phases[step_idx]); } - // After step 0 (surface), check for competing agents - if step_idx > 0 { - if let Ok(entries) = fs::read_dir(&output_dir_clone) { - for entry in entries.flatten() { - let name = entry.file_name(); - let name_str = name.to_string_lossy(); - if !name_str.starts_with("pid-") { continue; } - let other_pid: u32 = name_str.strip_prefix("pid-") - .and_then(|s| s.parse().ok()) - .unwrap_or(0); - if other_pid == pid || other_pid == 0 { continue; } - // Check if the other process is alive - if unsafe { libc::kill(other_pid as i32, 0) } == 0 { - log(&format!("bail: another agent running (pid {})", other_pid)); - return Err(format!("bailed at step {} — competing agent pid {}", - step_idx + 1, other_pid)); - } else { - // Dead process — clean up stale pid file - fs::remove_file(entry.path()).ok(); - } - } + // Run bail script if defined + if let Some(ref script) = bail_script { + let status = std::process::Command::new(script) + .arg(&pid_path_for_bail) + .current_dir(&output_dir_for_bail) + .status() + .map_err(|e| format!("bail script {:?} failed: {}", script, e))?; + if !status.success() { + return Err(format!("bailed at step {}: {:?} exited {}", + step_idx + 1, script.file_name().unwrap_or_default(), + status.code().unwrap_or(-1))); } } Ok(()) }; - let output = llm::call_for_def_multi(def, &prompts, Some(&bail_fn), log)?; + let output = match llm::call_for_def_multi(def, &prompts, Some(&bail_fn), log) { + Ok(output) => output, + Err(e) => { + fs::remove_file(&pid_path).ok(); + return Err(e); + } + }; - // Clean up PID file fs::remove_file(&pid_path).ok(); Ok(AgentResult {