Add MI300X provisioning script for vllm/Qwen 3.5 27B

ROCm-specific setup with:
- AITER attention backends (VLLM_ROCM_USE_AITER=1)
- Reduced cudagraph capture size (DeltaNet cache conflict)
- BF16 model + FP8 KV cache as default (FP8 weights can be
  slower on MI300X due to ROCm kernel maturity)
- FP8=1 flag for benchmarking FP8 model weights

Key for training plan: if FP8 matmuls are slow on MI300X,
the quantize-and-expand strategy needs B200 instead.
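
Example invocations for the comparison (assuming the script is run from the repo
root on the instance):

    # Default: BF16 weights + FP8 KV cache
    ./scripts/provision-mi300x.sh

    # FP8 weights, for benchmarking against BF16
    FP8=1 ./scripts/provision-mi300x.sh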

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Kent Overstreet 2026-03-19 14:40:15 -04:00
parent af3171d6ec
commit 377e2773bc

scripts/provision-mi300x.sh Executable file

@@ -0,0 +1,89 @@
#!/bin/bash
# provision-mi300x.sh — Set up vllm on an MI300X GPU instance (ROCm)
#
# Usage: ssh into your instance and run this script.
#
# Expects: AMD MI300X GPU with ROCm drivers
# Installs: vllm (ROCm wheels) with Qwen 3.5 27B
# Exposes: OpenAI-compatible API on port 8000
#
# Key differences from B200/CUDA setup:
# - ROCm wheels from wheels.vllm.ai/rocm
# - AITER attention backends (2.7-4.4x speedup)
# - Reduced cudagraph capture size (DeltaNet cache conflict)
# - BF16 model + FP8 KV cache (FP8 weights can be slower on MI300X)
set -euo pipefail
MODEL="${MODEL:-Qwen/Qwen3.5-27B}"
PORT="${PORT:-8000}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-131072}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}"
# Set FP8=1 to use FP8 model weights (for benchmarking vs BF16)
FP8="${FP8:-0}"
echo "=== MI300X vllm provisioning ==="
echo "Model: $MODEL"
echo "Port: $PORT"
echo "Max context: $MAX_MODEL_LEN"
echo ""
# --- Check for ROCm ---
if ! command -v rocm-smi &>/dev/null; then
    echo "ERROR: rocm-smi not found. Is ROCm installed?"
    exit 1
fi
echo "GPU status:"
rocm-smi --showproductname --showmeminfo vram 2>/dev/null || rocm-smi
echo ""
# --- Install vllm (ROCm wheels) ---
echo "Installing vllm (ROCm)..."
pip install --upgrade vllm \
    --extra-index-url https://wheels.vllm.ai/rocm \
    --break-system-packages 2>&1 | tail -5
# --- Use persistent storage if available ---
if [ -d /workspace ]; then
    export HF_HOME=/workspace/huggingface
    echo "Using persistent storage: $HF_HOME"
fi
# --- Download model ---
echo ""
echo "Downloading model (this may take a while on first run)..."
pip install --upgrade huggingface_hub --break-system-packages -q 2>/dev/null
python3 -c "from huggingface_hub import snapshot_download; snapshot_download('$MODEL')" 2>&1 | tail -5
echo ""
# --- Launch vllm ---
echo "Starting vllm server on port $PORT..."
echo "API will be available at http://0.0.0.0:$PORT/v1"
echo ""
# ROCm-specific environment variables
export VLLM_ROCM_USE_AITER=1 # Enable optimized AITER attention backends
export HIP_FORCE_DEV_KERNARG=1 # Kernel launch performance
export TORCH_BLAS_PREFER_HIPBLASLT=1 # Better BLAS performance
DTYPE_ARGS="--dtype bfloat16 --kv-cache-dtype fp8_e4m3"
if [ "$FP8" = "1" ]; then
DTYPE_ARGS="--dtype fp8_e4m3"
echo "*** FP8 mode: model weights AND KV cache in FP8 ***"
else
echo "*** BF16 mode: model in BF16, KV cache in FP8 ***"
fi
exec vllm serve "$MODEL" \
    --port "$PORT" \
    $DTYPE_ARGS \
    --max-model-len "$MAX_MODEL_LEN" \
    --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
    --enable-prefix-caching \
    --tool-call-parser hermes \
    --enable-auto-tool-choice \
    --reasoning-parser qwen3 \
    --trust-remote-code \
    --max-cudagraph-capture-size 64 \
    --uvicorn-log-level warning
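
Once the server is up, a quick smoke test against the OpenAI-compatible endpoint
might look like this (run from the instance itself; adjust the host if tunneling):

    curl http://localhost:8000/v1/models
    curl http://localhost:8000/v1/chat/completions \
        -H "Content-Type: application/json" \
        -d '{"model": "Qwen/Qwen3.5-27B", "messages": [{"role": "user", "content": "Say hello"}]}'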