vLLM priority scheduling for agents
Thread request priority through the API call chain to vLLM's priority scheduler. A lower value means higher priority, and higher-priority requests can preempt lower-priority ones.

Priority is set per-agent in the .agent header:
- interactive (runner): 0 (highest; no priority is sent, so the server default applies)
- surface-observe: 1 (near-realtime, watches the conversation)
- all other agents: 10 (batch; used when the header does not specify a priority)

Requires vLLM to be started with --scheduling-policy priority.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
503e2995c1
commit
c72eb4d528
8 changed files with 27 additions and 7 deletions
|
|
@ -103,6 +103,7 @@ impl ApiClient {
|
|||
ui_tx: &UiSender,
|
||||
reasoning_effort: &str,
|
||||
temperature: Option<f32>,
|
||||
priority: Option<i32>,
|
||||
) -> mpsc::UnboundedReceiver<StreamEvent> {
|
||||
let (tx, rx) = mpsc::unbounded_channel();
|
||||
let client = self.client.clone();
|
||||
|
|
@ -123,7 +124,7 @@ impl ApiClient {
|
|||
openai::stream_events(
|
||||
&client, base_url, &api_key, &model,
|
||||
&messages, tools.as_deref(), &tx, &ui_tx,
|
||||
&reasoning_effort, temperature,
|
||||
&reasoning_effort, temperature, priority,
|
||||
).await
|
||||
}
|
||||
Backend::Anthropic => {
|
||||
|
|
@ -155,7 +156,7 @@ impl ApiClient {
|
|||
ui_tx: &UiSender,
|
||||
reasoning_effort: &str,
|
||||
) -> Result<(Message, Option<Usage>)> {
|
||||
self.chat_completion_stream_temp(messages, tools, ui_tx, reasoning_effort, None).await
|
||||
self.chat_completion_stream_temp(messages, tools, ui_tx, reasoning_effort, None, None).await
|
||||
}
|
||||
|
||||
pub async fn chat_completion_stream_temp(
|
||||
|
|
@ -165,9 +166,10 @@ impl ApiClient {
|
|||
ui_tx: &UiSender,
|
||||
reasoning_effort: &str,
|
||||
temperature: Option<f32>,
|
||||
priority: Option<i32>,
|
||||
) -> Result<(Message, Option<Usage>)> {
|
||||
// Use the event stream and accumulate into a message.
|
||||
let mut rx = self.start_stream(messages, tools, ui_tx, reasoning_effort, temperature);
|
||||
let mut rx = self.start_stream(messages, tools, ui_tx, reasoning_effort, temperature, priority);
|
||||
let mut content = String::new();
|
||||
let mut tool_calls: Vec<ToolCall> = Vec::new();
|
||||
let mut usage = None;
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ pub async fn stream_events(
|
|||
ui_tx: &UiSender,
|
||||
reasoning_effort: &str,
|
||||
temperature: Option<f32>,
|
||||
priority: Option<i32>,
|
||||
) -> Result<()> {
|
||||
let request = ChatRequest {
|
||||
model: model.to_string(),
|
||||
|
|
@ -44,6 +45,7 @@ pub async fn stream_events(
|
|||
None
|
||||
},
|
||||
chat_template_kwargs: None,
|
||||
priority,
|
||||
};
|
||||
|
||||
let url = format!("{}/chat/completions", base_url);
|
||||
|
|
|
|||
|
|
@ -261,6 +261,7 @@ impl Agent {
|
|||
ui_tx,
|
||||
&self.reasoning_effort,
|
||||
None,
|
||||
None, // priority: interactive
|
||||
);
|
||||
|
||||
let mut content = String::new();
|
||||
|
|
|
|||
|
|
@ -132,6 +132,10 @@ pub struct ChatRequest {
|
|||
/// vllm chat template kwargs — used to disable thinking on Qwen 3.5
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub chat_template_kwargs: Option<serde_json::Value>,
|
||||
/// vllm request priority (lower = higher priority).
|
||||
/// 0 = interactive, 1 = surface-observe, 10 = batch agents.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub priority: Option<i32>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue