From 0e6b5dc8be5d869248067615a759ba27d7747b05 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 16 Apr 2026 15:41:28 -0400 Subject: [PATCH] agent: phase-aware bail script for surface-observe concurrency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bail-no-competing.sh used to bail if any other live agent existed in the state dir, period. That was too coarse: surface-observe agents run a multi-step pipeline (surface → organize-search → organize-new → observe), and the intent is to let a new surface-phase agent start while an older one finishes its post-surface tail. With the old check the newer agent always bailed, so surface-observe was effectively serialized at the slowest cycle time. Make the script phase-aware: - oneshot.rs now passes the current phase as argv[2] alongside the pid file name. The script writes that phase into its own pid file on every step transition, so concurrent agents can read each other's phase just by cat'ing the pid files. - Bail only when another live agent is in the same phase-group as us. Groups: "surface" vs. "everything else" (post-surface). At most one agent per group alive at a time — surface runs at a higher cadence than the organize/observe tail. - Still clean up stale pid files for dead processes. Co-Authored-By: Proof of Concept --- src/agent/oneshot.rs | 7 ++- src/subconscious/agents/bail-no-competing.sh | 46 ++++++++++++++++---- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/src/agent/oneshot.rs b/src/agent/oneshot.rs index 2fce906..0f04e4d 100644 --- a/src/agent/oneshot.rs +++ b/src/agent/oneshot.rs @@ -497,15 +497,20 @@ pub async fn run_one_agent( .map(|s| s.phase.clone()).collect(); // Bail check: if the agent defines a bail script, run it between steps. + // The script also refreshes our pid-file with the current phase — that's + // how concurrent agents know which phase each of us is in. let bail_script = def.bail.as_ref().map(|name| defs::agents_dir().join(name)); let state_dir_for_bail = state_dir.clone(); - // Find our own pid file so we can pass it to the bail script let our_pid = std::process::id(); let our_pid_file = format!("pid-{}", our_pid); + let step_phases_for_bail = step_phases.clone(); let bail_fn = move |step_idx: usize| -> Result<(), String> { if let Some(ref script) = bail_script { + let phase = step_phases_for_bail.get(step_idx) + .map(String::as_str).unwrap_or(""); let status = std::process::Command::new(script) .arg(&our_pid_file) + .arg(phase) .current_dir(&state_dir_for_bail) .status() .map_err(|e| format!("bail script {:?} failed: {}", script, e))?; diff --git a/src/subconscious/agents/bail-no-competing.sh b/src/subconscious/agents/bail-no-competing.sh index 43c3096..95b8219 100755 --- a/src/subconscious/agents/bail-no-competing.sh +++ b/src/subconscious/agents/bail-no-competing.sh @@ -1,21 +1,49 @@ #!/bin/bash -# Bail if other agents are alive in the state dir. -# $1 = this agent's pid file name (e.g. pid-12345) -# cwd = state dir +# Bail if another agent is in the same phase-group as us. # -# Exit 0 = continue, exit 1 = bail +# $1 = our pid file name (e.g. "pid-12345") +# $2 = the phase we're about to enter (e.g. "surface", "observe") +# cwd = state dir +# +# Also refreshes our own pid file with the current phase on each call, +# so concurrent agents can read each other's phase by cat'ing the pid +# files in the state dir. +# +# Phase groups: "surface" vs everything else ("post-surface"). We allow +# at most one agent per group to be alive at a time — so surface can run +# at a higher frequency than the slower organize/observe tail. +# +# Exit 0 = continue, exit 1 = bail (another agent in our group is alive). shopt -s nullglob my_pid_file="$1" +my_phase="$2" + +# Refresh our own pid file with the current phase. +printf '%s' "$my_phase" > "$my_pid_file" + +group_of() { + if [[ "$1" == "surface" ]]; then + echo "surface" + else + echo "post-surface" + fi +} + +my_group=$(group_of "$my_phase") for f in pid-*; do - [[ $f == $my_pid_file ]] && continue + [[ "$f" == "$my_pid_file" ]] && continue pid="${f#pid-}" - if kill -0 "$pid" 2>/dev/null; then - exit 1 # competing agent is alive - else - rm -f "$f" # stale pid file, clean up + if ! kill -0 "$pid" 2>/dev/null; then + rm -f "$f" # stale pid file, clean up + continue + fi + other_phase=$(cat "$f" 2>/dev/null) + other_group=$(group_of "$other_phase") + if [[ "$my_group" == "$other_group" ]]; then + exit 1 fi done