vLLM priority scheduling for agents

Thread request priority through the API call chain to vLLM's priority scheduler. Lower value = higher priority, with preemption.

Priority is set per-agent in the .agent header:
- interactive (runner): 0 (default, highest)
- surface-observe: 1 (near-realtime, watches conversation)
- all other agents: 10 (batch, default if not specified)

Requires vLLM started with --scheduling-policy priority.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-01 23:21:39 -04:00 · 2026-04-01 23:21:39 -04:00 · c72eb4d528
commit c72eb4d528
parent 503e2995c1
8 changed files with 27 additions and 7 deletions
--- a/src/subconscious/llm.rs
+++ b/src/subconscious/llm.rs
@@ -22,7 +22,7 @@ pub(crate) fn call_simple(caller: &str, prompt: &str) -> Result<String, String>

    let prompts = vec![prompt.to_string()];
    let phases = vec![];
-    super::api::call_api_with_tools_sync(caller, &prompts, &phases, None, &[], None, &log)
+    super::api::call_api_with_tools_sync(caller, &prompts, &phases, None, 10, &[], None, &log)
 }

 /// Call a model using an agent definition's configuration (multi-step).
@@ -34,7 +34,7 @@ pub(crate) fn call_for_def_multi(
    bail_fn: Option<&(dyn Fn(usize) -> Result<(), String> + Sync)>,
    log: &(dyn Fn(&str) + Sync),
 ) -> Result<String, String> {
-    super::api::call_api_with_tools_sync(&def.agent, prompts, phases, def.temperature, &def.tools, bail_fn, log)
+    super::api::call_api_with_tools_sync(&def.agent, prompts, phases, def.temperature, def.priority, &def.tools, bail_fn, log)
 }