vLLM priority scheduling for agents

Thread request priority through the API call chain to vLLM's
priority scheduler. Lower value = higher priority, with preemption.

Priority is set per-agent in the .agent header:
- interactive (runner): 0 (highest priority)
- surface-observe: 1 (near-realtime, watches conversation)
- all other agents: 10 (batch; the default when no priority is specified)

Requires vLLM started with --scheduling-policy priority.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-01 23:21:39 -04:00
parent 503e2995c1
commit c72eb4d528
8 changed files with 27 additions and 7 deletions

View file

@ -35,6 +35,7 @@ pub async fn call_api_with_tools(
prompts: &[String],
phases: &[String],
temperature: Option<f32>,
priority: i32,
tools: &[String],
bail_fn: Option<&(dyn Fn(usize) -> Result<(), String> + Sync)>,
log: &dyn Fn(&str),
@ -82,6 +83,7 @@ pub async fn call_api_with_tools(
&ui_tx,
&reasoning,
temperature,
Some(priority),
).await {
Ok((msg, usage)) => {
msg_opt = Some(msg);
@ -233,6 +235,7 @@ pub fn call_api_with_tools_sync(
prompts: &[String],
phases: &[String],
temperature: Option<f32>,
priority: i32,
tools: &[String],
bail_fn: Option<&(dyn Fn(usize) -> Result<(), String> + Sync)>,
log: &(dyn Fn(&str) + Sync),
@ -244,7 +247,7 @@ pub fn call_api_with_tools_sync(
.build()
.map_err(|e| format!("tokio runtime: {}", e))?;
rt.block_on(
call_api_with_tools(agent, prompts, phases, temperature, tools, bail_fn, log)
call_api_with_tools(agent, prompts, phases, temperature, priority, tools, bail_fn, log)
)
}).join().unwrap()
})