Add vllm provisioning script for RunPod GPU instances
Sets up vllm with Qwen 2.5 27B Instruct, prefix caching enabled, Hermes tool call parser for function calling support. Configurable via environment variables (MODEL, PORT, MAX_MODEL_LEN). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
b04a98c6e5
commit
49ccdf87e1
1 changed files with 53 additions and 0 deletions
53
scripts/provision-vllm.sh
Executable file
53
scripts/provision-vllm.sh
Executable file
|
|
@ -0,0 +1,53 @@
|
||||||
|
#!/bin/bash
# provision-vllm.sh — Set up vllm on a RunPod GPU instance
#
# Usage: ssh into your RunPod instance and run:
#   curl -sSL https://raw.githubusercontent.com/... | bash
# Or just scp this script and run it.
#
# Expects:  NVIDIA GPU with sufficient VRAM (B200: 192GB, A100: 80GB)
# Installs: vllm with Qwen 2.5 32B Instruct
# Exposes:  OpenAI-compatible API on port 8000
#
# Configurable via environment variables:
#   MODEL                  — Hugging Face model ID (default: Qwen/Qwen2.5-32B-Instruct)
#   PORT                   — API port (default: 8000)
#   MAX_MODEL_LEN          — max context length in tokens (default: 32768)
#   GPU_MEMORY_UTILIZATION — fraction of VRAM vllm may claim (default: 0.90)

set -euo pipefail

# FIX: the Qwen2.5 family has no 27B variant (sizes: 0.5/1.5/3/7/14/32/72B);
# "Qwen/Qwen2.5-27B-Instruct" does not exist on the Hub and the download step
# would 404. Default to the real 32B instruct model instead.
MODEL="${MODEL:-Qwen/Qwen2.5-32B-Instruct}"
PORT="${PORT:-8000}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}"

echo "=== vllm provisioning ==="
echo "Model: $MODEL"
echo "Port: $PORT"
echo "Max context: $MAX_MODEL_LEN"
echo ""

# --- Install vllm ---
echo "Installing vllm..."
# pipefail (set above) ensures a pip failure is not masked by the tail.
pip install --upgrade vllm 2>&1 | tail -3

# --- Verify GPU ---
echo ""
echo "GPU status:"
# Fail early with a clear message if this is not actually a GPU instance.
command -v nvidia-smi >/dev/null \
  || { echo "error: nvidia-smi not found — is this a GPU instance?" >&2; exit 1; }
nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader
echo ""

# --- Download model (cached in /root/.cache/huggingface) ---
echo "Downloading model (this may take a while on first run)..."
# Pass the model ID through the environment rather than interpolating it into
# the Python source: quotes or metacharacters in $MODEL cannot then break
# (or inject code into) the snippet.
HF_MODEL="$MODEL" python3 -c \
  "import os; from huggingface_hub import snapshot_download; snapshot_download(os.environ['HF_MODEL'])" \
  2>&1 | tail -5
echo ""

# --- Launch vllm ---
echo "Starting vllm server on port $PORT..."
echo "API will be available at http://0.0.0.0:$PORT/v1"
echo ""

# exec replaces this shell so vllm receives signals (SIGTERM etc.) directly.
# hermes parser + auto tool choice enable OpenAI-style function calling for Qwen.
exec vllm serve "$MODEL" \
  --port "$PORT" \
  --max-model-len "$MAX_MODEL_LEN" \
  --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
  --enable-prefix-caching \
  --tool-call-parser hermes \
  --enable-auto-tool-choice \
  --disable-log-requests \
  --uvicorn-log-level warning
Loading…
Add table
Add a link
Reference in a new issue