vLLM priority scheduling for agents

Thread request priority through the API call chain to vLLM's
priority scheduler. Lower value = higher priority, with preemption.

Priority is set per-agent in the .agent header:
- interactive (runner): 0 (default, highest)
- surface-observe: 1 (near-realtime, watches conversation)
- all other agents: 10 (batch, default if not specified)

Requires vLLM started with --scheduling-policy priority.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-01 23:21:39 -04:00
parent 503e2995c1
commit c72eb4d528
8 changed files with 27 additions and 7 deletions

View file

@@ -46,6 +49,9 @@ pub struct AgentDef {
pub chunk_size: Option<usize>,
pub chunk_overlap: Option<usize>,
pub temperature: Option<f32>,
/// vLLM scheduling priority (lower = higher priority).
/// 0 = interactive, 1 = near-realtime, 10 = batch (default).
pub priority: i32,
/// Bail check command — run between steps with pid file path as $1,
/// cwd = state dir. Non-zero exit = stop the pipeline.
pub bail: Option<String>,
@@ -75,6 +78,9 @@ struct AgentHeader {
/// LLM temperature override
#[serde(default)]
temperature: Option<f32>,
/// vLLM scheduling priority (lower = higher priority, default 10 = batch)
#[serde(default = "default_priority")]
priority: i32,
/// Bail check command — run between steps with pid file path as $1,
/// cwd = state dir. Non-zero exit = stop the pipeline.
#[serde(default)]
@@ -82,6 +88,7 @@ struct AgentHeader {
}
/// Serde default for the agent's `model` field when the header omits it.
fn default_model() -> String {
    String::from("sonnet")
}
/// Serde default for `priority`: 10, the batch tier (lower value = higher
/// priority, so unannotated agents schedule behind interactive work).
fn default_priority() -> i32 {
    10
}
/// Parse an agent file: first line is JSON config, rest is the prompt(s).
/// Multiple prompts are separated by `=== PROMPT [phase:name] ===` lines.
@@ -149,6 +156,7 @@ fn parse_agent_file(content: &str) -> Option<AgentDef> {
chunk_size: header.chunk_size,
chunk_overlap: header.chunk_overlap,
temperature: header.temperature,
priority: header.priority,
bail: header.bail,
})
}