diff --git a/scripts/provision-vllm.sh b/scripts/provision-vllm.sh
new file mode 100755
index 0000000..e7b3a91
--- /dev/null
+++ b/scripts/provision-vllm.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# provision-vllm.sh — Set up vllm on a RunPod GPU instance
+#
+# Usage: ssh into your RunPod instance and run:
+#   curl -sSL https://raw.githubusercontent.com/... | bash
+# Or just scp this script and run it.
+#
+# Expects: NVIDIA GPU with sufficient VRAM (B200: 192GB, A100: 80GB)
+# Installs: vllm with Qwen 2.5 32B Instruct
+# Exposes: OpenAI-compatible API on port 8000
+
+set -euo pipefail
+
+# Qwen2.5 ships in 0.5/1.5/3/7/14/32/72B sizes; the previously listed
+# 27B variant does not exist on the Hub and the download step would fail.
+MODEL="${MODEL:-Qwen/Qwen2.5-32B-Instruct}"
+PORT="${PORT:-8000}"
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
+GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}"
+
+echo "=== vllm provisioning ==="
+echo "Model: $MODEL"
+echo "Port: $PORT"
+echo "Max context: $MAX_MODEL_LEN"
+echo ""
+
+# --- Verify GPU first: fail fast before a multi-GB install/download ---
+command -v nvidia-smi >/dev/null 2>&1 || {
+  echo "error: nvidia-smi not found — is this a GPU instance?" >&2
+  exit 1
+}
+echo "GPU status:"
+nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader
+echo ""
+
+# --- Install vllm ---
+echo "Installing vllm..."
+pip install --upgrade vllm 2>&1 | tail -3
+
+# --- Download model (cached in /root/.cache/huggingface) ---
+echo ""
+echo "Downloading model (this may take a while on first run)..."
+# Pass the model id as argv rather than interpolating it into the Python
+# source, so names containing quotes/metacharacters cannot break or inject.
+python3 -c "import sys; from huggingface_hub import snapshot_download; snapshot_download(sys.argv[1])" "$MODEL" 2>&1 | tail -5
+echo ""
+
+# --- Launch vllm (exec replaces the shell so signals reach the server) ---
+echo "Starting vllm server on port $PORT..."
+echo "API will be available at http://0.0.0.0:$PORT/v1"
+echo ""
+
+exec vllm serve "$MODEL" \
+  --port "$PORT" \
+  --max-model-len "$MAX_MODEL_LEN" \
+  --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
+  --enable-prefix-caching \
+  --tool-call-parser hermes \
+  --enable-auto-tool-choice \
+  --disable-log-requests \
+  --uvicorn-log-level warning