From 377e2773bc9c560277c78f759a200ebd9b506bb1 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Thu, 19 Mar 2026 14:40:15 -0400
Subject: [PATCH] Add MI300X provisioning script for vllm/Qwen 3.5 27B

ROCm-specific setup with:
- AITER attention backends (VLLM_ROCM_USE_AITER=1)
- Reduced cudagraph capture size (DeltaNet cache conflict)
- BF16 model + FP8 KV cache as default (FP8 weights can be slower on
  MI300X due to ROCm kernel maturity)
- FP8=1 flag for benchmarking FP8 model weights

Key for training plan: if FP8 matmuls are slow on MI300X, the
quantize-and-expand strategy needs B200 instead.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 scripts/provision-mi300x.sh | 109 +++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100755 scripts/provision-mi300x.sh

diff --git a/scripts/provision-mi300x.sh b/scripts/provision-mi300x.sh
new file mode 100755
index 0000000..5a47738
--- /dev/null
+++ b/scripts/provision-mi300x.sh
@@ -0,0 +1,109 @@
+#!/bin/bash
+# provision-mi300x.sh — Set up vllm on an MI300X GPU instance (ROCm)
+#
+# Usage: ssh into your instance and run this script.
+#
+# Expects: AMD MI300X GPU with ROCm drivers
+# Installs: vllm (ROCm wheels) with Qwen 3.5 27B
+# Exposes: OpenAI-compatible API on port 8000 (override with PORT)
+#
+# Environment overrides (defaults in parentheses):
+#   MODEL                   Hugging Face model id (Qwen/Qwen3.5-27B)
+#   PORT                    API listen port (8000)
+#   MAX_MODEL_LEN           value passed to --max-model-len (131072)
+#   GPU_MEMORY_UTILIZATION  value passed to --gpu-memory-utilization (0.90)
+#   FP8                     1 = FP8 model weights, for benchmarking (0)
+#
+# Key differences from B200/CUDA setup:
+# - ROCm wheels from wheels.vllm.ai/rocm
+# - AITER attention backends (2.7-4.4x speedup)
+# - Reduced cudagraph capture size (DeltaNet cache conflict)
+# - BF16 model + FP8 KV cache (FP8 weights can be slower on MI300X)
+
+set -euo pipefail
+
+MODEL="${MODEL:-Qwen/Qwen3.5-27B}"
+PORT="${PORT:-8000}"
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-131072}"
+GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}"
+# Set FP8=1 to use FP8 model weights (for benchmarking vs BF16)
+FP8="${FP8:-0}"
+
+echo "=== MI300X vllm provisioning ==="
+echo "Model: $MODEL"
+echo "Port: $PORT"
+echo "Max context: $MAX_MODEL_LEN"
+echo ""
+
+# --- Check for ROCm ---
+if ! command -v rocm-smi &>/dev/null; then
+  echo "ERROR: rocm-smi not found. Is ROCm installed?" >&2
+  exit 1
+fi
+
+echo "GPU status:"
+# Fall back to the plain summary if the detailed query fails.
+rocm-smi --showproductname --showmeminfo vram 2>/dev/null || rocm-smi
+echo ""
+
+# --- Install vllm (ROCm wheels) ---
+# pipefail (set above) makes a pip failure abort the script even though
+# the output is piped through tail.
+echo "Installing vllm (ROCm)..."
+pip install --upgrade vllm \
+  --extra-index-url https://wheels.vllm.ai/rocm \
+  --break-system-packages 2>&1 | tail -5
+
+# --- Use persistent storage if available ---
+if [[ -d /workspace ]]; then
+  export HF_HOME=/workspace/huggingface
+  echo "Using persistent storage: $HF_HOME"
+fi
+
+# --- Download model ---
+echo ""
+echo "Downloading model (this may take a while on first run)..."
+# stderr is left visible so a failed install can be diagnosed.
+pip install --upgrade huggingface_hub --break-system-packages -q
+# The model id is passed as an argument rather than spliced into the Python
+# source, so quotes or backslashes in $MODEL cannot alter the code.
+python3 -c 'import sys
+from huggingface_hub import snapshot_download
+snapshot_download(sys.argv[1])' "$MODEL" 2>&1 | tail -5
+echo ""
+
+# --- Launch vllm ---
+echo "Starting vllm server on port $PORT..."
+echo "API will be available at http://0.0.0.0:$PORT/v1"
+echo ""
+
+# ROCm-specific environment variables
+export VLLM_ROCM_USE_AITER=1          # Enable optimized AITER attention backends
+export HIP_FORCE_DEV_KERNARG=1        # Kernel launch performance
+export TORCH_BLAS_PREFER_HIPBLASLT=1  # Better BLAS performance
+
+# Precision flags live in an array so each flag/value stays one word.
+# vllm's --dtype only accepts auto/half/float16/bfloat16/float/float32;
+# FP8 weights are requested with --quantization fp8, and the FP8 KV cache
+# is kept in both modes.
+DTYPE_ARGS=(--dtype bfloat16 --kv-cache-dtype fp8_e4m3)
+if [[ "$FP8" == "1" ]]; then
+  DTYPE_ARGS+=(--quantization fp8)
+  echo "*** FP8 mode: model weights AND KV cache in FP8 ***"
+else
+  echo "*** BF16 mode: model in BF16, KV cache in FP8 ***"
+fi
+
+# exec replaces this shell with the server process.
+exec vllm serve "$MODEL" \
+  --port "$PORT" \
+  "${DTYPE_ARGS[@]}" \
+  --max-model-len "$MAX_MODEL_LEN" \
+  --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
+  --enable-prefix-caching \
+  --tool-call-parser hermes \
+  --enable-auto-tool-choice \
+  --reasoning-parser qwen3 \
+  --trust-remote-code \
+  --max-cudagraph-capture-size 64 \
+  --uvicorn-log-level warning