vLLM priority scheduling for agents

Thread request priority through the API call chain to vLLM's
priority scheduler. Lower value = higher priority, with preemption.

Priority is set per-agent in the .agent header:
- interactive (runner): 0 (highest)
- surface-observe: 1 (near-realtime, watches conversation)
- all other agents: 10 (batch, default if not specified)

Requires vLLM started with --scheduling-policy priority.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-01 23:21:39 -04:00
parent 503e2995c1
commit c72eb4d528
8 changed files with 27 additions and 7 deletions

View file

@@ -103,6 +103,7 @@ impl ApiClient {
ui_tx: &UiSender,
reasoning_effort: &str,
temperature: Option<f32>,
+priority: Option<i32>,
) -> mpsc::UnboundedReceiver<StreamEvent> {
let (tx, rx) = mpsc::unbounded_channel();
let client = self.client.clone();
@@ -123,7 +124,7 @@ impl ApiClient {
openai::stream_events(
&client, base_url, &api_key, &model,
&messages, tools.as_deref(), &tx, &ui_tx,
-&reasoning_effort, temperature,
+&reasoning_effort, temperature, priority,
).await
}
Backend::Anthropic => {
@@ -155,7 +156,7 @@ impl ApiClient {
ui_tx: &UiSender,
reasoning_effort: &str,
) -> Result<(Message, Option<Usage>)> {
-self.chat_completion_stream_temp(messages, tools, ui_tx, reasoning_effort, None).await
+self.chat_completion_stream_temp(messages, tools, ui_tx, reasoning_effort, None, None).await
}
pub async fn chat_completion_stream_temp(
@@ -165,9 +166,10 @@ impl ApiClient {
ui_tx: &UiSender,
reasoning_effort: &str,
temperature: Option<f32>,
+priority: Option<i32>,
) -> Result<(Message, Option<Usage>)> {
// Use the event stream and accumulate into a message.
-let mut rx = self.start_stream(messages, tools, ui_tx, reasoning_effort, temperature);
+let mut rx = self.start_stream(messages, tools, ui_tx, reasoning_effort, temperature, priority);
let mut content = String::new();
let mut tool_calls: Vec<ToolCall> = Vec::new();
let mut usage = None;