vLLM priority scheduling for agents

Thread request priority through the API call chain to vLLM's
priority scheduler. Lower value = higher priority, with preemption.

Priority is set per-agent in the .agent header:
- interactive (runner): 0 (highest)
- surface-observe: 1 (near-realtime, watches conversation)
- all other agents: 10 (batch, default if not specified)

Requires vLLM started with --scheduling-policy priority.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-01 23:21:39 -04:00
parent 503e2995c1
commit c72eb4d528
8 changed files with 27 additions and 7 deletions

View file

@@ -103,6 +103,7 @@ impl ApiClient {
ui_tx: &UiSender,
reasoning_effort: &str,
temperature: Option<f32>,
+priority: Option<i32>,
) -> mpsc::UnboundedReceiver<StreamEvent> {
let (tx, rx) = mpsc::unbounded_channel();
let client = self.client.clone();
@@ -123,7 +124,7 @@ impl ApiClient {
openai::stream_events(
&client, base_url, &api_key, &model,
&messages, tools.as_deref(), &tx, &ui_tx,
-&reasoning_effort, temperature,
+&reasoning_effort, temperature, priority,
).await
}
Backend::Anthropic => {
@@ -155,7 +156,7 @@ impl ApiClient {
ui_tx: &UiSender,
reasoning_effort: &str,
) -> Result<(Message, Option<Usage>)> {
-self.chat_completion_stream_temp(messages, tools, ui_tx, reasoning_effort, None).await
+self.chat_completion_stream_temp(messages, tools, ui_tx, reasoning_effort, None, None).await
}
pub async fn chat_completion_stream_temp(
@@ -165,9 +166,10 @@ impl ApiClient {
ui_tx: &UiSender,
reasoning_effort: &str,
temperature: Option<f32>,
+priority: Option<i32>,
) -> Result<(Message, Option<Usage>)> {
// Use the event stream and accumulate into a message.
-let mut rx = self.start_stream(messages, tools, ui_tx, reasoning_effort, temperature);
+let mut rx = self.start_stream(messages, tools, ui_tx, reasoning_effort, temperature, priority);
let mut content = String::new();
let mut tool_calls: Vec<ToolCall> = Vec::new();
let mut usage = None;