Fix poc-agent for vllm/Qwen 3.5: reasoning display, tool parser

- Always display reasoning tokens regardless of reasoning_effort setting — Qwen 3.5 thinks natively and the reasoning parser separates it into its own field
- Remove chat_template_kwargs that disabled thinking when reasoning_effort was "none"
- Add chat_template_kwargs field to ChatRequest for vllm compatibility
- Update provision script: qwen3_xml tool parser, qwen3 reasoning parser, 262K context, 95% GPU memory utilization

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-19 00:06:26 -04:00 · 2026-03-19 00:06:26 -04:00 · f83325b44d
commit f83325b44d
parent 49ccdf87e1
3 changed files with 18 additions and 10 deletions
--- a/scripts/provision-vllm.sh
+++ b/scripts/provision-vllm.sh
@@ -6,15 +6,15 @@
 # Or just scp this script and run it.
 #
 # Expects: NVIDIA GPU with sufficient VRAM (B200: 192GB, A100: 80GB)
-# Installs: vllm with Qwen 2.5 27B Instruct
+# Installs: vllm with Qwen 3.5 27B
 # Exposes: OpenAI-compatible API on port 8000

 set -euo pipefail

-MODEL="${MODEL:-Qwen/Qwen2.5-27B-Instruct}"
+MODEL="${MODEL:-Qwen/Qwen3.5-27B}"
 PORT="${PORT:-8000}"
-MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
-GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}"
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-262144}"
+GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.95}"

 echo "=== vllm provisioning ==="
 echo "Model: $MODEL"
@@ -24,7 +24,10 @@ echo ""

 # --- Install vllm ---
 echo "Installing vllm..."
-pip install --upgrade vllm 2>&1 | tail -3
+pip install --upgrade vllm --break-system-packages 2>&1 | tail -3
+
+# --- Use persistent storage ---
+export HF_HOME=/workspace/huggingface

 # --- Verify GPU ---
 echo ""
@@ -34,6 +37,7 @@ echo ""

 # --- Download model (cached in /root/.cache/huggingface) ---
 echo "Downloading model (this may take a while on first run)..."
+pip install --upgrade huggingface_hub --break-system-packages -q 2>/dev/null
 python3 -c "from huggingface_hub import snapshot_download; snapshot_download('$MODEL')" 2>&1 | tail -5
 echo ""

@@ -47,7 +51,7 @@ exec vllm serve "$MODEL" \
    --max-model-len "$MAX_MODEL_LEN" \
    --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
    --enable-prefix-caching \
-    --tool-call-parser hermes \
+    --tool-call-parser qwen3_xml \
    --enable-auto-tool-choice \
-    --disable-log-requests \
+    --reasoning-parser=qwen3 \
    --uvicorn-log-level warning