#!/bin/bash
# provision-vllm.sh — Set up vllm on a RunPod GPU instance
#
# Usage: ssh into your RunPod instance and run:
#   curl -sSL https://raw.githubusercontent.com/... | bash
# Or just scp this script and run it.
#
# Expects:  NVIDIA GPU with sufficient VRAM (B200: 192GB, A100: 80GB)
# Installs: vllm with Qwen 3.5 27B
# Exposes:  OpenAI-compatible API on port 8000
#
# Configurable via environment variables (defaults shown below):
#   MODEL, PORT, MAX_MODEL_LEN, GPU_MEMORY_UTILIZATION

set -euo pipefail

MODEL="${MODEL:-Qwen/Qwen3.5-27B}"
PORT="${PORT:-8000}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-262144}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.95}"

echo "=== vllm provisioning ==="
echo "Model: $MODEL"
echo "Port: $PORT"
echo "Max context: $MAX_MODEL_LEN"
echo ""

# --- Install vllm ---
# tail -3 keeps output terse; pipefail still surfaces a pip failure.
echo "Installing vllm..."
pip install --upgrade vllm --break-system-packages 2>&1 | tail -3

# --- Use persistent storage ---
# RunPod's /workspace volume survives pod restarts; keep HF cache there so
# the model is not re-downloaded every time.
export HF_HOME=/workspace/huggingface
mkdir -p "$HF_HOME"

# --- Verify GPU ---
command -v nvidia-smi >/dev/null 2>&1 || {
  echo "error: nvidia-smi not found — is this a GPU instance?" >&2
  exit 1
}
echo ""
echo "GPU status:"
nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader
echo ""

# --- Download model (cached under $HF_HOME = /workspace/huggingface) ---
echo "Downloading model (this may take a while on first run)..."
pip install --upgrade huggingface_hub --break-system-packages -q 2>/dev/null
# Pass the model name via the environment instead of interpolating it into
# the Python source — avoids quoting/injection breakage on unusual names.
MODEL="$MODEL" python3 -c \
  'import os; from huggingface_hub import snapshot_download; snapshot_download(os.environ["MODEL"])' \
  2>&1 | tail -5
echo ""

# --- Launch vllm ---
# exec replaces this shell so vllm receives signals directly (clean shutdown).
echo "Starting vllm server on port $PORT..."
echo "API will be available at http://0.0.0.0:$PORT/v1"
echo ""
exec vllm serve "$MODEL" \
  --port "$PORT" \
  --max-model-len "$MAX_MODEL_LEN" \
  --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
  --enable-prefix-caching \
  --tool-call-parser hermes \
  --enable-auto-tool-choice \
  --reasoning-parser=qwen3 \
  --uvicorn-log-level warning