agent: phase-aware bail script for surface-observe concurrency

bail-no-competing.sh used to bail if any other live agent existed in
the state dir, period. That was too coarse: surface-observe agents run
a multi-step pipeline (surface → organize-search → organize-new →
observe), and the intent is to let a new surface-phase agent start
while an older one finishes its post-surface tail. With the old check
the newer agent always bailed, so surface-observe was effectively
serialized at the slowest cycle time.

Make the script phase-aware:

- oneshot.rs now passes the current phase as argv[2] alongside the pid
  file name. The script writes that phase into the calling agent's pid
  file on every step transition, so concurrent agents can read each
  other's phase just by cat'ing the pid files.

- Bail only when another live agent is in the same phase-group as us.
  Groups: "surface" vs. "everything else" (post-surface). At most one
  agent per group is alive at a time, so surface can run at a higher
  cadence than the slower organize/observe tail.

- Still clean up stale pid files for dead processes.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-16 15:41:28 -04:00
parent 2eddf3b4cf
commit 0e6b5dc8be
2 changed files with 43 additions and 10 deletions

View file

@ -497,15 +497,20 @@ pub async fn run_one_agent(
.map(|s| s.phase.clone()).collect(); .map(|s| s.phase.clone()).collect();
// Bail check: if the agent defines a bail script, run it between steps. // Bail check: if the agent defines a bail script, run it between steps.
// The script also refreshes our pid-file with the current phase — that's
// how concurrent agents know which phase each of us is in.
let bail_script = def.bail.as_ref().map(|name| defs::agents_dir().join(name)); let bail_script = def.bail.as_ref().map(|name| defs::agents_dir().join(name));
let state_dir_for_bail = state_dir.clone(); let state_dir_for_bail = state_dir.clone();
// Find our own pid file so we can pass it to the bail script
let our_pid = std::process::id(); let our_pid = std::process::id();
let our_pid_file = format!("pid-{}", our_pid); let our_pid_file = format!("pid-{}", our_pid);
let step_phases_for_bail = step_phases.clone();
let bail_fn = move |step_idx: usize| -> Result<(), String> { let bail_fn = move |step_idx: usize| -> Result<(), String> {
if let Some(ref script) = bail_script { if let Some(ref script) = bail_script {
let phase = step_phases_for_bail.get(step_idx)
.map(String::as_str).unwrap_or("");
let status = std::process::Command::new(script) let status = std::process::Command::new(script)
.arg(&our_pid_file) .arg(&our_pid_file)
.arg(phase)
.current_dir(&state_dir_for_bail) .current_dir(&state_dir_for_bail)
.status() .status()
.map_err(|e| format!("bail script {:?} failed: {}", script, e))?; .map_err(|e| format!("bail script {:?} failed: {}", script, e))?;

View file

@ -1,21 +1,49 @@
#!/bin/bash #!/bin/bash
# Bail if other agents are alive in the state dir. # Bail if another agent is in the same phase-group as us.
# $1 = this agent's pid file name (e.g. pid-12345)
# cwd = state dir
# #
# Exit 0 = continue, exit 1 = bail # $1 = our pid file name (e.g. "pid-12345")
# $2 = the phase we're about to enter (e.g. "surface", "observe")
# cwd = state dir
#
# Also refreshes our own pid file with the current phase on each call,
# so concurrent agents can read each other's phase by cat'ing the pid
# files in the state dir.
#
# Phase groups: "surface" vs everything else ("post-surface"). We allow
# at most one agent per group to be alive at a time — so surface can run
# at a higher frequency than the slower organize/observe tail.
#
# Exit 0 = continue, exit 1 = bail (another agent in our group is alive).
shopt -s nullglob shopt -s nullglob
my_pid_file="$1" my_pid_file="$1"
my_phase="$2"
# Refresh our own pid file with the current phase.
printf '%s' "$my_phase" > "$my_pid_file"
group_of() {
if [[ "$1" == "surface" ]]; then
echo "surface"
else
echo "post-surface"
fi
}
my_group=$(group_of "$my_phase")
for f in pid-*; do for f in pid-*; do
[[ $f == $my_pid_file ]] && continue [[ "$f" == "$my_pid_file" ]] && continue
pid="${f#pid-}" pid="${f#pid-}"
if kill -0 "$pid" 2>/dev/null; then if ! kill -0 "$pid" 2>/dev/null; then
exit 1 # competing agent is alive rm -f "$f" # stale pid file, clean up
else continue
rm -f "$f" # stale pid file, clean up fi
other_phase=$(cat "$f" 2>/dev/null)
other_group=$(group_of "$other_phase")
if [[ "$my_group" == "$other_group" ]]; then
exit 1
fi fi
done done