90 lines
2.8 KiB
Bash
90 lines
2.8 KiB
Bash
|
|
#!/bin/bash
# provision-mi300x.sh — Set up vllm on an MI300X GPU instance (ROCm)
#
# Usage: ssh into your instance and run this script.
#
# Expects: AMD MI300X GPU with ROCm drivers
# Installs: vllm (ROCm wheels) with Qwen 3.5 27B (override with MODEL)
# Exposes: OpenAI-compatible API on port 8000 (override with PORT)
#
# Key differences from B200/CUDA setup:
# - ROCm wheels from wheels.vllm.ai/rocm
# - AITER attention backends (2.7-4.4x speedup)
# - Reduced cudagraph capture size (DeltaNet cache conflict)
# - BF16 model + FP8 KV cache (FP8 weights can be slower on MI300X)
|
||
|
|
|
||
|
|
# Strict mode: stop on the first failing command, on any unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Tunables. Each keeps the caller's environment value when one is set and
# non-empty; otherwise the default shown here is assigned.
: "${MODEL:=Qwen/Qwen3.5-27B}"
: "${PORT:=8000}"
: "${MAX_MODEL_LEN:=131072}"
: "${GPU_MEMORY_UTILIZATION:=0.90}"
# Set FP8=1 to use FP8 model weights (for benchmarking vs BF16)
: "${FP8:=0}"
|
||
|
|
|
||
|
|
# Startup banner: echo the effective settings, followed by a blank line.
cat <<EOF
=== MI300X vllm provisioning ===
Model: $MODEL
Port: $PORT
Max context: $MAX_MODEL_LEN

EOF
|
||
|
|
|
||
|
|
# --- Check for ROCm ---
# Fail fast when the ROCm CLI is absent. The diagnostic goes to stderr so it
# is not lost when stdout is redirected or piped.
if ! command -v rocm-smi >/dev/null 2>&1; then
  echo "ERROR: rocm-smi not found. Is ROCm installed?" >&2
  exit 1
fi

echo "GPU status:"
# Try the product-name / VRAM report first; if that invocation fails, fall
# back to the plain rocm-smi report.
rocm-smi --showproductname --showmeminfo vram 2>/dev/null || rocm-smi
echo ""
|
||
|
|
|
||
|
|
# --- Install vllm (ROCm wheels) ---
echo "Installing vllm (ROCm)..."

# pip arguments kept in an array so each option stays a separate word.
pip_install_args=(
  install --upgrade vllm
  --extra-index-url https://wheels.vllm.ai/rocm
  --break-system-packages
)

# Merge stderr into stdout and show only the last five lines of pip output.
pip "${pip_install_args[@]}" 2>&1 | tail -n 5
|
||
|
|
|
||
|
|
# --- Use persistent storage if available ---
# When /workspace exists, export HF_HOME pointing inside it so that child
# processes started later by this script inherit the setting.
if [[ -d /workspace ]]; then
  HF_HOME=/workspace/huggingface
  export HF_HOME
  echo "Using persistent storage: $HF_HOME"
fi
|
||
|
|
|
||
|
|
# --- Download model ---
echo ""
echo "Downloading model (this may take a while on first run)..."

# pip output stays suppressed as before, but a failure is now reported
# explicitly instead of the script exiting with no message.
if ! pip install --upgrade huggingface_hub --break-system-packages -q 2>/dev/null; then
  echo "ERROR: failed to install huggingface_hub" >&2
  exit 1
fi

# The model id is passed as an argument (sys.argv[1]) rather than being
# interpolated into the Python source, so quotes or other special characters
# in MODEL cannot break or alter the program text.
python3 -c '
import sys
from huggingface_hub import snapshot_download

snapshot_download(sys.argv[1])
' "$MODEL" 2>&1 | tail -n 5
echo ""
|
||
|
|
|
||
|
|
# --- Launch vllm ---
# Announce the listening port and API base URL, followed by a blank line.
printf '%s\n' \
  "Starting vllm server on port $PORT..." \
  "API will be available at http://0.0.0.0:$PORT/v1" \
  ""
|
||
|
|
|
||
|
|
# ROCm-specific environment variables (exported so the server process
# launched by this script inherits them)
VLLM_ROCM_USE_AITER=1          # Enable optimized AITER attention backends
HIP_FORCE_DEV_KERNARG=1        # Kernel launch performance
TORCH_BLAS_PREFER_HIPBLASLT=1  # Better BLAS performance
export VLLM_ROCM_USE_AITER HIP_FORCE_DEV_KERNARG TORCH_BLAS_PREFER_HIPBLASLT
|
||
|
|
|
||
|
|
#######################################
# Choose the precision-related flags passed to `vllm serve`.
#
# vLLM's --dtype option only accepts activation/compute dtypes (auto, half,
# float16, bfloat16, float, float32), so FP8 weights are requested through
# --quantization fp8 rather than --dtype. Both modes keep the FP8 KV cache,
# which is what the FP8 banner below states.
#
# Globals:   DTYPE_ARGS (written) - space-separated flag string, intended to
#            be expanded unquoted on the vllm command line
# Arguments: $1 - "1" selects FP8 weights; any other value selects BF16
# Outputs:   one-line mode banner on stdout
#######################################
select_dtype_args() {
  if [[ "${1:-0}" == "1" ]]; then
    DTYPE_ARGS="--dtype bfloat16 --quantization fp8 --kv-cache-dtype fp8_e4m3"
    echo "*** FP8 mode: model weights AND KV cache in FP8 ***"
  else
    DTYPE_ARGS="--dtype bfloat16 --kv-cache-dtype fp8_e4m3"
    echo "*** BF16 mode: model in BF16, KV cache in FP8 ***"
  fi
}

select_dtype_args "${FP8:-0}"
|
||
|
|
|
||
|
|
# Assemble the server command line as an array, then replace this shell with
# the vllm process via exec (nothing after this point runs).
# DTYPE_ARGS is a space-separated flag string and is expanded unquoted on
# purpose so that it splits into individual arguments.
# shellcheck disable=SC2086,SC2206
serve_args=(
  "$MODEL"
  --port "$PORT"
  $DTYPE_ARGS
  --max-model-len "$MAX_MODEL_LEN"
  --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION"
  --enable-prefix-caching
  --tool-call-parser hermes
  --enable-auto-tool-choice
  --reasoning-parser qwen3
  --trust-remote-code
  --max-cudagraph-capture-size 64
  --uvicorn-log-level warning
)

exec vllm serve "${serve_args[@]}"
|