#!/bin/bash
# provision-mi300x.sh — Set up vllm on an MI300X GPU instance (ROCm)
#
# Usage: ssh into your instance and run this script.
#
# Expects:  AMD MI300X GPU with ROCm drivers
# Installs: vllm (ROCm wheels) with Qwen 3.5 27B
# Exposes:  OpenAI-compatible API on port 8000
#
# Optional environment overrides:
#   MODEL                   Hugging Face model id   (default: Qwen/Qwen3.5-27B)
#   PORT                    API listen port         (default: 8000)
#   MAX_MODEL_LEN           Max context length      (default: 131072)
#   GPU_MEMORY_UTILIZATION  Passed through to vllm  (default: 0.90)
#   FP8                     1 = FP8 weights + FP8 KV cache, otherwise BF16
#                           weights + FP8 KV cache  (default: 0)
#
# Key differences from B200/CUDA setup:
#   - ROCm wheels from wheels.vllm.ai/rocm
#   - AITER attention backends (2.7-4.4x speedup)
#   - Reduced cudagraph capture size (DeltaNet cache conflict)
#   - BF16 model + FP8 KV cache (FP8 weights can be slower on MI300X)

set -euo pipefail

MODEL="${MODEL:-Qwen/Qwen3.5-27B}"
PORT="${PORT:-8000}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-131072}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}"
# Set FP8=1 to use FP8 model weights (for benchmarking vs BF16)
FP8="${FP8:-0}"

echo "=== MI300X vllm provisioning ==="
echo "Model: $MODEL"
echo "Port: $PORT"
echo "Max context: $MAX_MODEL_LEN"
echo ""

# --- Check for ROCm ---
if ! command -v rocm-smi &>/dev/null; then
  # Diagnostics go to stderr so they are not lost when stdout is captured.
  echo "ERROR: rocm-smi not found. Is ROCm installed?" >&2
  exit 1
fi

echo "GPU status:"
# Fall back to the plain summary if this rocm-smi rejects the detailed flags.
rocm-smi --showproductname --showmeminfo vram 2>/dev/null || rocm-smi
echo ""

# --- Install vllm (ROCm wheels) ---
echo "Installing vllm (ROCm)..."
# pipefail (set above) makes a pip failure abort the script even though the
# output is trimmed through tail.
pip install --upgrade vllm \
  --extra-index-url https://wheels.vllm.ai/rocm \
  --break-system-packages 2>&1 | tail -5

# --- Use persistent storage if available ---
if [[ -d /workspace ]]; then
  export HF_HOME=/workspace/huggingface
  echo "Using persistent storage: $HF_HOME"
fi

# --- Download model ---
echo ""
echo "Downloading model (this may take a while on first run)..."
# stderr is silenced to hide pip's warnings, so report a failure explicitly
# instead of letting set -e exit without any message.
if ! pip install --upgrade huggingface_hub --break-system-packages -q 2>/dev/null; then
  echo "ERROR: failed to install huggingface_hub" >&2
  exit 1
fi
# The model id is passed as an argument rather than spliced into the Python
# source, so quotes or other special characters in MODEL cannot alter the code.
python3 -c 'import sys; from huggingface_hub import snapshot_download; snapshot_download(sys.argv[1])' \
  "$MODEL" 2>&1 | tail -5
echo ""

# --- Launch vllm ---
echo "Starting vllm server on port $PORT..."
echo "API will be available at http://0.0.0.0:$PORT/v1"
echo ""

# ROCm-specific environment variables
export VLLM_ROCM_USE_AITER=1        # Enable optimized AITER attention backends
export HIP_FORCE_DEV_KERNARG=1      # Kernel launch performance
export TORCH_BLAS_PREFER_HIPBLASLT=1 # Better BLAS performance

# Precision flags are kept in an array so each flag/value reaches vllm as its
# own argument without relying on word-splitting of an unquoted string.
dtype_args=(--dtype bfloat16 --kv-cache-dtype fp8_e4m3)
if [[ "$FP8" == "1" ]]; then
  # FP8 is not an accepted --dtype value; FP8 weights are requested through
  # --quantization, and the KV cache flag is kept so the banner below is true.
  dtype_args=(--quantization fp8 --kv-cache-dtype fp8_e4m3)
  echo "*** FP8 mode: model weights AND KV cache in FP8 ***"
else
  echo "*** BF16 mode: model in BF16, KV cache in FP8 ***"
fi

# exec replaces this shell with the server process, so nothing below runs.
# NOTE(review): confirm "qwen35_coder" is a tool-call parser name registered
# in the installed vllm build.
exec vllm serve "$MODEL" \
  --port "$PORT" \
  "${dtype_args[@]}" \
  --max-model-len "$MAX_MODEL_LEN" \
  --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
  --enable-prefix-caching \
  --enable-auto-tool-choice \
  --tool-call-parser qwen35_coder \
  --reasoning-parser qwen3 \
  --trust-remote-code \
  --max-cudagraph-capture-size 64 \
  --uvicorn-log-level warning