vLLM priority scheduling for agents

Thread request priority through the API call chain to vLLM's priority scheduler. Lower value = higher priority, with preemption.

Priority is set per-agent in the .agent header:
- interactive (runner): 0 (default, highest)
- surface-observe: 1 (near-realtime, watches conversation)
- all other agents: 10 (batch; the default when the header does not specify a priority)

Requires vLLM started with --scheduling-policy priority.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-01 23:21:39 -04:00 · 2026-04-01 23:21:39 -04:00 · c72eb4d528
commit c72eb4d528
parent 503e2995c1
8 changed files with 27 additions and 7 deletions
--- a/src/agent/api/mod.rs
+++ b/src/agent/api/mod.rs
@@ -103,6 +103,7 @@ impl ApiClient {
        ui_tx: &UiSender,
        reasoning_effort: &str,
        temperature: Option<f32>,
+        priority: Option<i32>,
    ) -> mpsc::UnboundedReceiver<StreamEvent> {
        let (tx, rx) = mpsc::unbounded_channel();
        let client = self.client.clone();
@@ -123,7 +124,7 @@ impl ApiClient {
                    openai::stream_events(
                        &client, base_url, &api_key, &model,
                        &messages, tools.as_deref(), &tx, &ui_tx,
-                        &reasoning_effort, temperature,
+                        &reasoning_effort, temperature, priority,
                    ).await
                }
                Backend::Anthropic => {
@@ -155,7 +156,7 @@ impl ApiClient {
        ui_tx: &UiSender,
        reasoning_effort: &str,
    ) -> Result<(Message, Option<Usage>)> {
-        self.chat_completion_stream_temp(messages, tools, ui_tx, reasoning_effort, None).await
+        self.chat_completion_stream_temp(messages, tools, ui_tx, reasoning_effort, None, None).await
    }

    pub async fn chat_completion_stream_temp(
@@ -165,9 +166,10 @@ impl ApiClient {
        ui_tx: &UiSender,
        reasoning_effort: &str,
        temperature: Option<f32>,
+        priority: Option<i32>,
    ) -> Result<(Message, Option<Usage>)> {
        // Use the event stream and accumulate into a message.
-        let mut rx = self.start_stream(messages, tools, ui_tx, reasoning_effort, temperature);
+        let mut rx = self.start_stream(messages, tools, ui_tx, reasoning_effort, temperature, priority);
        let mut content = String::new();
        let mut tool_calls: Vec<ToolCall> = Vec::new();
        let mut usage = None;
--- a/src/agent/api/openai.rs
+++ b/src/agent/api/openai.rs
@@ -26,6 +26,7 @@ pub async fn stream_events(
    ui_tx: &UiSender,
    reasoning_effort: &str,
    temperature: Option<f32>,
+    priority: Option<i32>,
 ) -> Result<()> {
    let request = ChatRequest {
        model: model.to_string(),
@@ -44,6 +45,7 @@ pub async fn stream_events(
            None
        },
        chat_template_kwargs: None,
+        priority,
    };

    let url = format!("{}/chat/completions", base_url);
--- a/src/agent/runner.rs
+++ b/src/agent/runner.rs
@@ -261,6 +261,7 @@ impl Agent {
                ui_tx,
                &self.reasoning_effort,
                None,
+                None, // priority: interactive
            );

            let mut content = String::new();
--- a/src/agent/types.rs
+++ b/src/agent/types.rs
@@ -132,6 +132,10 @@ pub struct ChatRequest {
    /// vllm chat template kwargs — used to disable thinking on Qwen 3.5
    #[serde(skip_serializing_if = "Option::is_none")]
    pub chat_template_kwargs: Option<serde_json::Value>,
+    /// vllm request priority (lower = higher priority).
+    /// 0 = interactive, 1 = surface-observe, 10 = batch agents.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub priority: Option<i32>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]