vLLM priority scheduling for agents

Thread request priority through the API call chain to vLLM's
priority scheduler. Lower value = higher priority, with preemption.

Priority is set per-agent in the .agent header:
- interactive (runner): 0 (default, highest)
- surface-observe: 1 (near-realtime, watches conversation)
- all other agents: 10 (batch, default if not specified)

Requires vLLM started with --scheduling-policy priority.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-01 23:21:39 -04:00
parent 503e2995c1
commit c72eb4d528
8 changed files with 27 additions and 7 deletions

View file

@@ -46,6 +49,9 @@ pub struct AgentDef {
pub chunk_size: Option<usize>,
pub chunk_overlap: Option<usize>,
pub temperature: Option<f32>,
/// vLLM scheduling priority (lower = higher priority).
/// 0 = interactive, 1 = near-realtime, 10 = batch (default).
pub priority: i32,
/// Bail check command — run between steps with pid file path as $1,
/// cwd = state dir. Non-zero exit = stop the pipeline.
pub bail: Option<String>,
@@ -75,6 +78,9 @@ struct AgentHeader {
/// LLM temperature override
#[serde(default)]
temperature: Option<f32>,
/// vLLM scheduling priority (lower = higher priority, default 10 = batch)
#[serde(default = "default_priority")]
priority: i32,
/// Bail check command — run between steps with pid file path as $1,
/// cwd = state dir. Non-zero exit = stop the pipeline.
#[serde(default)]
@@ -82,6 +88,7 @@ struct AgentHeader {
}
/// Serde default for the agent's `model` field when the header omits it.
fn default_model() -> String {
    String::from("sonnet")
}
/// Serde default for `priority`: 10, the batch tier (lower value = higher
/// priority, so unannotated agents schedule behind interactive work).
fn default_priority() -> i32 {
    10
}
/// Parse an agent file: first line is JSON config, rest is the prompt(s).
/// Multiple prompts are separated by `=== PROMPT [phase:name] ===` lines.
@@ -149,6 +156,7 @@ fn parse_agent_file(content: &str) -> Option<AgentDef> {
chunk_size: header.chunk_size,
chunk_overlap: header.chunk_overlap,
temperature: header.temperature,
priority: header.priority,
bail: header.bail,
})
}