Add vllm provisioning script for RunPod GPU instances

Sets up vllm with Qwen 2.5 27B Instruct, with prefix caching enabled and the Hermes tool call parser for function-calling support. Configurable via environment variables (MODEL, PORT, MAX_MODEL_LEN, GPU_MEMORY_UTILIZATION).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-18 23:13:04 -04:00 · 2026-03-18 23:13:04 -04:00 · 49ccdf87e1
commit 49ccdf87e1
parent b04a98c6e5
1 changed files with 53 additions and 0 deletions
--- a/scripts/provision-vllm.sh
+++ b/scripts/provision-vllm.sh
@ -0,0 +1,53 @@
 #!/bin/bash
 # provision-vllm.sh — Set up vllm on a RunPod GPU instance
 #
 # Usage: ssh into your RunPod instance and run:
 #   curl -sSL https://raw.githubusercontent.com/... | bash
 # Or just scp this script and run it.
 #
 # Expects: NVIDIA GPU with sufficient VRAM (B200: 192GB, A100: 80GB)
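 # Installs: vllm with Qwen 2.5 27B Instruct (default; override with MODEL)
 # Exposes: OpenAI-compatible API on port 8000 (default; override with PORT)
 # Also configurable via environment: MAX_MODEL_LEN, GPU_MEMORY_UTILIZATION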
 # Strict mode: abort on command failure, on unset variables, and on a failure
 # in any stage of a pipeline.
 set -euo pipefail
 # Tunables. Each keeps the caller's value when it is set and non-empty, and
 # falls back to the default shown here otherwise.
 : "${MODEL:=Qwen/Qwen2.5-27B-Instruct}"
 : "${PORT:=8000}"
 : "${MAX_MODEL_LEN:=32768}"
 : "${GPU_MEMORY_UTILIZATION:=0.90}"
 # Startup banner summarising the effective settings, followed by a blank line.
 printf '%s\n' \
   "=== vllm provisioning ===" \
   "Model: $MODEL" \
   "Port:  $PORT" \
   "Max context: $MAX_MODEL_LEN" \
   ""
 # --- Verify GPU ---
 # Probed before the install so that a host without a working NVIDIA driver
 # fails immediately instead of after the package installation has run.
 echo "GPU status:"
 if ! nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader; then
   echo "error: nvidia-smi failed; a working NVIDIA GPU and driver are required" >&2
   exit 1
 fi
 echo ""
 # --- Install vllm ---
 # Invoke pip through python3 so the package is installed into the same
 # interpreter that the model download step runs under. Only the last three
 # lines of pip's combined stdout/stderr are shown to keep the log short.
 echo "Installing vllm..."
 python3 -m pip install --upgrade vllm 2>&1 | tail -3
 echo ""
 # --- Download model (cached in /root/.cache/huggingface) ---
 # MODEL is passed to Python as a command-line argument (sys.argv[1]) rather
 # than being spliced into the Python source, so a value containing quotes or
 # other special characters cannot break or alter the executed code.
 # Only the last five lines of combined stdout/stderr are shown.
 echo "Downloading model (this may take a while on first run)..."
 python3 -c 'import sys; from huggingface_hub import snapshot_download; snapshot_download(sys.argv[1])' "$MODEL" 2>&1 | tail -5
 echo ""
 # --- Launch vllm ---
 printf '%s\n' \
   "Starting vllm server on port $PORT..." \
   "API will be available at http://0.0.0.0:$PORT/v1" \
   ""
 # Server flags, gathered in an array so each one sits on its own line.
 # NOTE(review): --disable-log-requests may be deprecated in newer vllm
 # releases — confirm against the installed version's `vllm serve --help`.
 serve_args=(
   --port "$PORT"
   --max-model-len "$MAX_MODEL_LEN"
   --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION"
   --enable-prefix-caching
   --tool-call-parser hermes
   --enable-auto-tool-choice
   --disable-log-requests
   --uvicorn-log-level warning
 )
 # exec replaces this shell with the server process, so nothing after this
 # line runs.
 exec vllm serve "$MODEL" "${serve_args[@]}"