vLLM priority scheduling for agents

Thread request priority through the API call chain to vLLM's
priority scheduler. Lower value = higher priority, with preemption.

Priority is set per-agent in the .agent header:
- interactive (runner): 0 (highest priority)
- surface-observe: 1 (near-realtime, watches conversation)
- all other agents: 10 (batch; the default when no priority is specified)

Requires vLLM started with --scheduling-policy priority.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-01 23:21:39 -04:00
parent 503e2995c1
commit c72eb4d528
8 changed files with 27 additions and 7 deletions

View file

@ -35,6 +35,7 @@ pub async fn call_api_with_tools(
prompts: &[String],
phases: &[String],
temperature: Option<f32>,
priority: i32,
tools: &[String],
bail_fn: Option<&(dyn Fn(usize) -> Result<(), String> + Sync)>,
log: &dyn Fn(&str),
@ -82,6 +83,7 @@ pub async fn call_api_with_tools(
&ui_tx,
&reasoning,
temperature,
Some(priority),
).await {
Ok((msg, usage)) => {
msg_opt = Some(msg);
@ -233,6 +235,7 @@ pub fn call_api_with_tools_sync(
prompts: &[String],
phases: &[String],
temperature: Option<f32>,
priority: i32,
tools: &[String],
bail_fn: Option<&(dyn Fn(usize) -> Result<(), String> + Sync)>,
log: &(dyn Fn(&str) + Sync),
@ -244,7 +247,7 @@ pub fn call_api_with_tools_sync(
.build()
.map_err(|e| format!("tokio runtime: {}", e))?;
rt.block_on(
call_api_with_tools(agent, prompts, phases, temperature, tools, bail_fn, log)
call_api_with_tools(agent, prompts, phases, temperature, priority, tools, bail_fn, log)
)
}).join().unwrap()
})