diff --git a/poc-agent/src/api/openai.rs b/poc-agent/src/api/openai.rs index e34dc5d..e40f59e 100644 --- a/poc-agent/src/api/openai.rs +++ b/poc-agent/src/api/openai.rs @@ -34,6 +34,7 @@ pub async fn stream( enabled: reasoning_effort != "none", effort: Some(reasoning_effort.to_string()), }), + chat_template_kwargs: None, }; let url = format!("{}/chat/completions", base_url); @@ -96,14 +97,14 @@ pub async fn stream( if let Some(ref r) = choice.delta.reasoning_content { reasoning_chars += r.len(); has_reasoning = true; - if reasoning_enabled && !r.is_empty() { + if !r.is_empty() { let _ = ui_tx.send(UiMessage::Reasoning(r.clone())); } } if let Some(ref r) = choice.delta.reasoning { reasoning_chars += r.len(); has_reasoning = true; - if reasoning_enabled && !r.is_empty() { + if !r.is_empty() { let _ = ui_tx.send(UiMessage::Reasoning(r.clone())); } } @@ -111,7 +112,7 @@ let s = r.to_string(); reasoning_chars += s.len(); has_reasoning = true; - if reasoning_enabled && !s.is_empty() && s != "null" { + if !s.is_empty() && s != "null" { let _ = ui_tx.send(UiMessage::Reasoning(s)); } } diff --git a/poc-agent/src/types.rs b/poc-agent/src/types.rs index 60d6dd1..2cdc62c 100644 --- a/poc-agent/src/types.rs +++ b/poc-agent/src/types.rs @@ -129,6 +129,9 @@ pub struct ChatRequest { /// - reasoning.effort (documented: "none" disables entirely) #[serde(skip_serializing_if = "Option::is_none")] pub reasoning: Option<ReasoningConfig>, + /// vllm chat template kwargs — used to disable thinking on Qwen 3.5 + #[serde(skip_serializing_if = "Option::is_none")] + pub chat_template_kwargs: Option<serde_json::Value>, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/scripts/provision-vllm.sh b/scripts/provision-vllm.sh index e7b3a91..ee35670 100755 --- a/scripts/provision-vllm.sh +++ b/scripts/provision-vllm.sh @@ -6,15 +6,15 @@ # Or just scp this script and run it. 
# # Expects: NVIDIA GPU with sufficient VRAM (B200: 192GB, A100: 80GB) -# Installs: vllm with Qwen 2.5 27B Instruct +# Installs: vllm with Qwen 3.5 27B # Exposes: OpenAI-compatible API on port 8000 set -euo pipefail -MODEL="${MODEL:-Qwen/Qwen2.5-27B-Instruct}" +MODEL="${MODEL:-Qwen/Qwen3.5-27B}" PORT="${PORT:-8000}" -MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}" -GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}" +MAX_MODEL_LEN="${MAX_MODEL_LEN:-262144}" +GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.95}" echo "=== vllm provisioning ===" echo "Model: $MODEL" @@ -24,7 +24,10 @@ echo "" # --- Install vllm --- echo "Installing vllm..." -pip install --upgrade vllm 2>&1 | tail -3 +pip install --upgrade vllm --break-system-packages 2>&1 | tail -3 + +# --- Use persistent storage --- +export HF_HOME=/workspace/huggingface # --- Verify GPU --- echo "" @@ -34,6 +37,7 @@ echo "" # --- Download model (cached in /root/.cache/huggingface) --- echo "Downloading model (this may take a while on first run)..." +pip install --upgrade huggingface_hub --break-system-packages -q 2>/dev/null python3 -c "from huggingface_hub import snapshot_download; snapshot_download('$MODEL')" 2>&1 | tail -5 echo "" @@ -47,7 +51,7 @@ exec vllm serve "$MODEL" \ --max-model-len "$MAX_MODEL_LEN" \ --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \ --enable-prefix-caching \ - --tool-call-parser hermes \ + --tool-call-parser qwen3_xml \ --enable-auto-tool-choice \ - --disable-log-requests \ + --reasoning-parser=qwen3 \ --uvicorn-log-level warning