agents: bail script support, pid file simplification, cleanup
- Bail command moved from hardcoded closure to external script
specified in agent JSON header ("bail": "bail-no-competing.sh")
- Runner executes script between steps with pid file path as $1,
cwd = state dir. Non-zero exit stops the pipeline.
- PID files simplified to just the phase name (no JSON) for easy
bash inspection (cat pid-*)
- scan_pid_files helper deduplicates pid scanning logic
- Timeout check uses file mtime instead of embedded timestamp
- PID file cleaned up on bail/error (not just success)
- output() tool validates key names (rejects pid-*, /, ..)
- Agent log files append instead of truncate
- Fixed orphaned derive and doc comment on AgentStep/AgentDef
- Phase written after bail check passes, not before
Co-Authored-By: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
parent
e20aeeeabe
commit
52703b4637
5 changed files with 135 additions and 85 deletions
|
|
@ -139,6 +139,9 @@ pub fn dispatch(name: &str, args: &serde_json::Value, provenance: Option<&str>)
|
||||||
}
|
}
|
||||||
"output" => {
|
"output" => {
|
||||||
let key = get_str(args, "key")?;
|
let key = get_str(args, "key")?;
|
||||||
|
if key.starts_with("pid-") || key.contains('/') || key.contains("..") {
|
||||||
|
anyhow::bail!("invalid output key: {}", key);
|
||||||
|
}
|
||||||
let value = get_str(args, "value")?;
|
let value = get_str(args, "value")?;
|
||||||
let dir = std::env::var("POC_AGENT_OUTPUT_DIR")
|
let dir = std::env::var("POC_AGENT_OUTPUT_DIR")
|
||||||
.map_err(|_| anyhow::anyhow!("no output directory set"))?;
|
.map_err(|_| anyhow::anyhow!("no output directory set"))?;
|
||||||
|
|
|
||||||
|
|
@ -129,6 +129,46 @@ fn mark_seen(dir: &Path, session_id: &str, key: &str, seen: &mut HashSet<String>
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Check for live agent processes in a state dir. Returns (phase, pid) pairs.
|
||||||
|
/// Cleans up stale pid files and kills timed-out processes.
|
||||||
|
fn scan_pid_files(state_dir: &Path, timeout_secs: u64, self_pid: u32) -> Vec<(String, u32)> {
|
||||||
|
let mut live = Vec::new();
|
||||||
|
let Ok(entries) = fs::read_dir(state_dir) else { return live };
|
||||||
|
for entry in entries.flatten() {
|
||||||
|
let name = entry.file_name();
|
||||||
|
let name_str = name.to_string_lossy();
|
||||||
|
if !name_str.starts_with("pid-") { continue; }
|
||||||
|
let pid: u32 = name_str.strip_prefix("pid-")
|
||||||
|
.and_then(|s| s.parse().ok())
|
||||||
|
.unwrap_or(0);
|
||||||
|
if pid == 0 || pid == self_pid { continue; }
|
||||||
|
|
||||||
|
if unsafe { libc::kill(pid as i32, 0) } != 0 {
|
||||||
|
fs::remove_file(entry.path()).ok();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Timeout via mtime
|
||||||
|
if timeout_secs > 0 {
|
||||||
|
if let Ok(meta) = entry.metadata() {
|
||||||
|
if let Ok(modified) = meta.modified() {
|
||||||
|
if modified.elapsed().unwrap_or_default().as_secs() > timeout_secs {
|
||||||
|
unsafe { libc::kill(pid as i32, libc::SIGTERM); }
|
||||||
|
fs::remove_file(entry.path()).ok();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let phase = fs::read_to_string(entry.path())
|
||||||
|
.unwrap_or_default()
|
||||||
|
.trim().to_string();
|
||||||
|
live.push((phase, pid));
|
||||||
|
}
|
||||||
|
live
|
||||||
|
}
|
||||||
|
|
||||||
/// Unified agent cycle — runs surface-observe agent with state dir.
|
/// Unified agent cycle — runs surface-observe agent with state dir.
|
||||||
/// Reads output files for surface results, spawns new agent when ready.
|
/// Reads output files for surface results, spawns new agent when ready.
|
||||||
///
|
///
|
||||||
|
|
@ -144,50 +184,12 @@ fn surface_observe_cycle(session: &Session, out: &mut String, log_f: &mut File)
|
||||||
.surface_timeout_secs
|
.surface_timeout_secs
|
||||||
.unwrap_or(300) as u64;
|
.unwrap_or(300) as u64;
|
||||||
|
|
||||||
// Scan pid files — find live agents and their phases
|
let live = scan_pid_files(&state_dir, timeout, 0);
|
||||||
let mut any_in_surface = false;
|
for (phase, pid) in &live {
|
||||||
let mut any_alive = false;
|
let _ = writeln!(log_f, "alive pid-{}: phase={}", pid, phase);
|
||||||
if let Ok(entries) = fs::read_dir(&state_dir) {
|
|
||||||
for entry in entries.flatten() {
|
|
||||||
let name = entry.file_name();
|
|
||||||
let name_str = name.to_string_lossy();
|
|
||||||
if !name_str.starts_with("pid-") { continue; }
|
|
||||||
let pid: u32 = name_str.strip_prefix("pid-")
|
|
||||||
.and_then(|s| s.parse().ok())
|
|
||||||
.unwrap_or(0);
|
|
||||||
if pid == 0 { continue; }
|
|
||||||
|
|
||||||
let alive = unsafe { libc::kill(pid as i32, 0) == 0 };
|
|
||||||
if !alive {
|
|
||||||
let _ = writeln!(log_f, "cleanup stale pid-{}", pid);
|
|
||||||
fs::remove_file(entry.path()).ok();
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check for timeout
|
|
||||||
let phase_json = fs::read_to_string(entry.path()).unwrap_or_default();
|
|
||||||
let started: u64 = phase_json.split("\"started\":")
|
|
||||||
.nth(1)
|
|
||||||
.and_then(|s| s.trim_start().split(|c: char| !c.is_ascii_digit()).next())
|
|
||||||
.and_then(|s| s.parse().ok())
|
|
||||||
.unwrap_or(0);
|
|
||||||
if started > 0 && now_secs().saturating_sub(started) > timeout {
|
|
||||||
let _ = writeln!(log_f, "killing timed-out pid-{} ({}s)", pid, timeout);
|
|
||||||
unsafe { libc::kill(pid as i32, libc::SIGTERM); }
|
|
||||||
fs::remove_file(entry.path()).ok();
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
any_alive = true;
|
|
||||||
|
|
||||||
let in_surface = phase_json.contains("\"phase\":\"surface\"")
|
|
||||||
|| phase_json.contains("\"phase\":\"step-0\"");
|
|
||||||
if in_surface {
|
|
||||||
any_in_surface = true;
|
|
||||||
}
|
|
||||||
let _ = writeln!(log_f, "alive pid-{}: {}", pid, phase_json.trim());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
let any_in_surface = live.iter().any(|(p, _)| p == "surface" || p == "step-0");
|
||||||
|
let any_alive = !live.is_empty();
|
||||||
|
|
||||||
// Read surface output and inject into context
|
// Read surface output and inject into context
|
||||||
let surface_path = state_dir.join("surface");
|
let surface_path = state_dir.join("surface");
|
||||||
|
|
@ -230,21 +232,39 @@ fn surface_observe_cycle(session: &Session, out: &mut String, log_f: &mut File)
|
||||||
let _ = writeln!(log_f, "agent past surface, starting new (pipeline)");
|
let _ = writeln!(log_f, "agent past surface, starting new (pipeline)");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if let Some(pid) = spawn_agent("surface-observe", &state_dir, &session.session_id) {
|
||||||
|
let _ = writeln!(log_f, "spawned pid {}", pid);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Spawn an agent asynchronously. Reads the .agent file to get the first
|
||||||
|
/// phase name, spawns the process, writes the pid file, and returns.
|
||||||
|
fn spawn_agent(agent_name: &str, state_dir: &Path, session_id: &str) -> Option<u32> {
|
||||||
|
// Read first phase from agent definition
|
||||||
|
let first_phase = crate::agents::defs::get_def(agent_name)
|
||||||
|
.and_then(|d| d.steps.first().map(|s| s.phase.clone()))
|
||||||
|
.unwrap_or_else(|| "step-0".into());
|
||||||
|
|
||||||
let log_dir = crate::store::memory_dir().join("logs");
|
let log_dir = crate::store::memory_dir().join("logs");
|
||||||
fs::create_dir_all(&log_dir).ok();
|
fs::create_dir_all(&log_dir).ok();
|
||||||
let agent_log = fs::File::create(log_dir.join("surface-observe.log"))
|
let agent_log = fs::OpenOptions::new()
|
||||||
|
.create(true).append(true)
|
||||||
|
.open(log_dir.join(format!("{}.log", agent_name)))
|
||||||
.unwrap_or_else(|_| fs::File::create("/dev/null").unwrap());
|
.unwrap_or_else(|_| fs::File::create("/dev/null").unwrap());
|
||||||
|
|
||||||
if let Ok(child) = Command::new("poc-memory")
|
let child = Command::new("poc-memory")
|
||||||
.args(["agent", "run", "surface-observe", "--count", "1", "--local",
|
.args(["agent", "run", agent_name, "--count", "1", "--local",
|
||||||
"--state-dir", &state_dir.to_string_lossy()])
|
"--state-dir", &state_dir.to_string_lossy()])
|
||||||
.env("POC_SESSION_ID", &session.session_id)
|
.env("POC_SESSION_ID", session_id)
|
||||||
.stdout(agent_log.try_clone().unwrap_or_else(|_| fs::File::create("/dev/null").unwrap()))
|
.stdout(agent_log.try_clone().unwrap_or_else(|_| fs::File::create("/dev/null").unwrap()))
|
||||||
.stderr(agent_log)
|
.stderr(agent_log)
|
||||||
.spawn()
|
.spawn()
|
||||||
{
|
.ok()?;
|
||||||
let _ = writeln!(log_f, "spawned pid {}", child.id());
|
|
||||||
}
|
let pid = child.id();
|
||||||
|
let pid_path = state_dir.join(format!("pid-{}", pid));
|
||||||
|
fs::write(&pid_path, &first_phase).ok();
|
||||||
|
Some(pid)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn cleanup_stale_files(dir: &Path, max_age: Duration) {
|
fn cleanup_stale_files(dir: &Path, max_age: Duration) {
|
||||||
|
|
|
||||||
22
src/subconscious/agents/bail-no-competing.sh
Executable file
22
src/subconscious/agents/bail-no-competing.sh
Executable file
|
|
@ -0,0 +1,22 @@
|
||||||
|
#!/bin/bash
|
||||||
|
# Bail if other agents are alive in the state dir.
|
||||||
|
# $1 = path to this agent's pid file
|
||||||
|
# cwd = state dir
|
||||||
|
#
|
||||||
|
# Exit 0 = continue, exit 1 = bail
|
||||||
|
|
||||||
|
my_pid_file=$(basename "$1")
|
||||||
|
|
||||||
|
for f in pid-*; do
|
||||||
|
[ "$f" = "$my_pid_file" ] && continue
|
||||||
|
[ ! -f "$f" ] && continue
|
||||||
|
|
||||||
|
pid=${f#pid-}
|
||||||
|
if kill -0 "$pid" 2>/dev/null; then
|
||||||
|
exit 1 # another agent is alive, bail
|
||||||
|
else
|
||||||
|
rm -f "$f" # stale, clean up
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
exit 0
|
||||||
|
|
@ -24,8 +24,6 @@ use serde::Deserialize;
|
||||||
|
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
/// Agent definition: config (from JSON header) + prompt (raw markdown body).
|
|
||||||
#[derive(Clone, Debug)]
|
|
||||||
/// A single step in a multi-step agent.
|
/// A single step in a multi-step agent.
|
||||||
pub struct AgentStep {
|
pub struct AgentStep {
|
||||||
pub prompt: String,
|
pub prompt: String,
|
||||||
|
|
@ -34,6 +32,7 @@ pub struct AgentStep {
|
||||||
pub phase: String,
|
pub phase: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Agent definition: config (from JSON header) + prompt steps.
|
||||||
pub struct AgentDef {
|
pub struct AgentDef {
|
||||||
pub agent: String,
|
pub agent: String,
|
||||||
pub query: String,
|
pub query: String,
|
||||||
|
|
@ -47,6 +46,9 @@ pub struct AgentDef {
|
||||||
pub chunk_size: Option<usize>,
|
pub chunk_size: Option<usize>,
|
||||||
pub chunk_overlap: Option<usize>,
|
pub chunk_overlap: Option<usize>,
|
||||||
pub temperature: Option<f32>,
|
pub temperature: Option<f32>,
|
||||||
|
/// Bail check command — run between steps with pid file path as $1,
|
||||||
|
/// cwd = state dir. Non-zero exit = stop the pipeline.
|
||||||
|
pub bail: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The JSON header portion (first line of the file).
|
/// The JSON header portion (first line of the file).
|
||||||
|
|
@ -73,6 +75,10 @@ struct AgentHeader {
|
||||||
/// LLM temperature override
|
/// LLM temperature override
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
temperature: Option<f32>,
|
temperature: Option<f32>,
|
||||||
|
/// Bail check command — run between steps with pid file path as $1,
|
||||||
|
/// cwd = state dir. Non-zero exit = stop the pipeline.
|
||||||
|
#[serde(default)]
|
||||||
|
bail: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn default_model() -> String { "sonnet".into() }
|
fn default_model() -> String { "sonnet".into() }
|
||||||
|
|
@ -143,10 +149,11 @@ fn parse_agent_file(content: &str) -> Option<AgentDef> {
|
||||||
chunk_size: header.chunk_size,
|
chunk_size: header.chunk_size,
|
||||||
chunk_overlap: header.chunk_overlap,
|
chunk_overlap: header.chunk_overlap,
|
||||||
temperature: header.temperature,
|
temperature: header.temperature,
|
||||||
|
bail: header.bail,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn agents_dir() -> PathBuf {
|
pub fn agents_dir() -> PathBuf {
|
||||||
let repo = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src/subconscious/agents");
|
let repo = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src/subconscious/agents");
|
||||||
if repo.is_dir() { return repo; }
|
if repo.is_dir() { return repo; }
|
||||||
crate::store::memory_dir().join("agents")
|
crate::store::memory_dir().join("agents")
|
||||||
|
|
|
||||||
|
|
@ -174,15 +174,11 @@ fn run_one_agent_inner(
|
||||||
// Safe: agent runs single-threaded, env var read only by our dispatch code
|
// Safe: agent runs single-threaded, env var read only by our dispatch code
|
||||||
unsafe { std::env::set_var("POC_AGENT_OUTPUT_DIR", &output_dir); }
|
unsafe { std::env::set_var("POC_AGENT_OUTPUT_DIR", &output_dir); }
|
||||||
|
|
||||||
// Write PID file with initial phase
|
// Write PID file — content is just the phase name
|
||||||
let pid = std::process::id();
|
let pid = std::process::id();
|
||||||
let pid_path = output_dir.join(format!("pid-{}", pid));
|
let pid_path = output_dir.join(format!("pid-{}", pid));
|
||||||
let write_pid = |phase: &str| {
|
let write_pid = |phase: &str| {
|
||||||
let json = format!("{{\"phase\":\"{}\",\"started\":{}}}", phase,
|
fs::write(&pid_path, phase).ok();
|
||||||
std::time::SystemTime::now()
|
|
||||||
.duration_since(std::time::UNIX_EPOCH)
|
|
||||||
.unwrap().as_secs());
|
|
||||||
fs::write(&pid_path, &json).ok();
|
|
||||||
};
|
};
|
||||||
write_pid(&agent_batch.steps[0].phase);
|
write_pid(&agent_batch.steps[0].phase);
|
||||||
|
|
||||||
|
|
@ -201,42 +197,44 @@ fn run_one_agent_inner(
|
||||||
}
|
}
|
||||||
log("\n=== CALLING LLM ===");
|
log("\n=== CALLING LLM ===");
|
||||||
|
|
||||||
// Bail check: between steps, check for other pid files in the state dir.
|
// Bail check: if the agent defines a bail script, run it between steps.
|
||||||
// If another agent has started, bail — let it have the resources.
|
// The script receives the pid file path as $1, cwd = state dir.
|
||||||
let output_dir_clone = output_dir.clone();
|
let bail_script = def.bail.as_ref().map(|name| {
|
||||||
|
// Look for the script next to the .agent file
|
||||||
|
let agents_dir = super::defs::agents_dir();
|
||||||
|
agents_dir.join(name)
|
||||||
|
});
|
||||||
|
let output_dir_for_bail = output_dir.clone();
|
||||||
|
let pid_path_for_bail = pid_path.clone();
|
||||||
let bail_fn = move |step_idx: usize| -> Result<(), String> {
|
let bail_fn = move |step_idx: usize| -> Result<(), String> {
|
||||||
|
// Update phase
|
||||||
if step_idx < step_phases.len() {
|
if step_idx < step_phases.len() {
|
||||||
write_pid(&step_phases[step_idx]);
|
write_pid(&step_phases[step_idx]);
|
||||||
}
|
}
|
||||||
// After step 0 (surface), check for competing agents
|
// Run bail script if defined
|
||||||
if step_idx > 0 {
|
if let Some(ref script) = bail_script {
|
||||||
if let Ok(entries) = fs::read_dir(&output_dir_clone) {
|
let status = std::process::Command::new(script)
|
||||||
for entry in entries.flatten() {
|
.arg(&pid_path_for_bail)
|
||||||
let name = entry.file_name();
|
.current_dir(&output_dir_for_bail)
|
||||||
let name_str = name.to_string_lossy();
|
.status()
|
||||||
if !name_str.starts_with("pid-") { continue; }
|
.map_err(|e| format!("bail script {:?} failed: {}", script, e))?;
|
||||||
let other_pid: u32 = name_str.strip_prefix("pid-")
|
if !status.success() {
|
||||||
.and_then(|s| s.parse().ok())
|
return Err(format!("bailed at step {}: {:?} exited {}",
|
||||||
.unwrap_or(0);
|
step_idx + 1, script.file_name().unwrap_or_default(),
|
||||||
if other_pid == pid || other_pid == 0 { continue; }
|
status.code().unwrap_or(-1)));
|
||||||
// Check if the other process is alive
|
|
||||||
if unsafe { libc::kill(other_pid as i32, 0) } == 0 {
|
|
||||||
log(&format!("bail: another agent running (pid {})", other_pid));
|
|
||||||
return Err(format!("bailed at step {} — competing agent pid {}",
|
|
||||||
step_idx + 1, other_pid));
|
|
||||||
} else {
|
|
||||||
// Dead process — clean up stale pid file
|
|
||||||
fs::remove_file(entry.path()).ok();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
};
|
};
|
||||||
|
|
||||||
let output = llm::call_for_def_multi(def, &prompts, Some(&bail_fn), log)?;
|
let output = match llm::call_for_def_multi(def, &prompts, Some(&bail_fn), log) {
|
||||||
|
Ok(output) => output,
|
||||||
|
Err(e) => {
|
||||||
|
fs::remove_file(&pid_path).ok();
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
// Clean up PID file
|
|
||||||
fs::remove_file(&pid_path).ok();
|
fs::remove_file(&pid_path).ok();
|
||||||
|
|
||||||
Ok(AgentResult {
|
Ok(AgentResult {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue