Sets up vLLM with Qwen 2.5 27B Instruct, with prefix caching enabled and the Hermes tool-call parser for function-calling support. Configurable via environment variables (MODEL, PORT, MAX_MODEL_LEN, GPU_MEMORY_UTILIZATION). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
53 lines
1.5 KiB
Bash
Executable file
53 lines
1.5 KiB
Bash
Executable file
#!/bin/bash
# provision-vllm.sh — Set up vllm on a RunPod GPU instance
#
# Usage: ssh into your RunPod instance and run:
#   curl -sSL https://raw.githubusercontent.com/... | bash
# Or just scp this script and run it.
#
# Expects:  NVIDIA GPU with sufficient VRAM (B200: 192GB, A100: 80GB)
# Installs: vllm with Qwen 2.5 27B Instruct
# Exposes:  OpenAI-compatible API on port 8000
#
# Environment overrides (defaults in parentheses):
#   MODEL                   Hugging Face repo id to download and serve
#                           (Qwen/Qwen2.5-27B-Instruct)
#   PORT                    TCP port passed to `vllm serve --port` (8000)
#   MAX_MODEL_LEN           value passed to --max-model-len (32768)
#   GPU_MEMORY_UTILIZATION  value passed to --gpu-memory-utilization (0.90)

set -euo pipefail

# NOTE(review): confirm the default repo id below actually exists on the
# Hugging Face Hub; if it does not, the download step fails. Override MODEL
# to serve a different checkpoint.
MODEL="${MODEL:-Qwen/Qwen2.5-27B-Instruct}"
PORT="${PORT:-8000}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}"

#######################################
# Print an error message to stderr and abort the script.
# Arguments: $* - message text
# Outputs:   message on stderr
# Returns:   never; exits with status 1
#######################################
die() {
  printf 'provision-vllm: %s\n' "$*" >&2
  exit 1
}

#######################################
# Install vllm, show GPU status, pre-download the model, then replace this
# shell with the vllm server process.
# Globals:   MODEL, PORT, MAX_MODEL_LEN, GPU_MEMORY_UTILIZATION (read)
# Arguments: none
# Outputs:   progress messages on stdout; errors on stderr
# Returns:   does not return on success (exec); non-zero exit on failure
#######################################
main() {
  # Fail fast: a bad port should not surface only after a long install and
  # model download. 10# forces base 10 so a leading zero is not read as octal.
  if ! [[ "$PORT" =~ ^[0-9]+$ ]] || (( 10#$PORT < 1 || 10#$PORT > 65535 )); then
    die "PORT must be an integer between 1 and 65535 (got '$PORT')"
  fi

  local tool
  for tool in python3 nvidia-smi; do
    command -v "$tool" >/dev/null || die "required command not found: $tool"
  done

  echo "=== vllm provisioning ==="
  printf 'Model: %s\n' "$MODEL"
  printf 'Port: %s\n' "$PORT"
  printf 'Max context: %s\n' "$MAX_MODEL_LEN"
  echo ""

  # --- Install vllm ---
  echo "Installing vllm..."
  # Use "python3 -m pip" so the package lands in the same interpreter that
  # performs the model download below. Only the last lines of pip's output
  # are shown; pipefail still propagates a pip failure through the pipeline.
  python3 -m pip install --upgrade vllm 2>&1 | tail -3
  command -v vllm >/dev/null \
    || die "vllm command not found on PATH after installation"

  # --- Verify GPU ---
  echo ""
  echo "GPU status:"
  nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader
  echo ""

  # --- Download model (cached in /root/.cache/huggingface) ---
  echo "Downloading model (this may take a while on first run)..."
  # The repo id is passed as an argument rather than interpolated into the
  # Python source, so quotes or other special characters in MODEL cannot
  # alter the code that runs.
  python3 -c \
    'import sys; from huggingface_hub import snapshot_download; snapshot_download(sys.argv[1])' \
    "$MODEL" 2>&1 | tail -5
  echo ""

  # --- Launch vllm ---
  printf 'Starting vllm server on port %s...\n' "$PORT"
  printf 'API will be available at http://0.0.0.0:%s/v1\n' "$PORT"
  echo ""

  # NOTE(review): --disable-log-requests may be deprecated or rejected by
  # newer vllm releases; since the install step always upgrades, confirm the
  # flag is still accepted by the version that gets installed.
  local -a serve_args=(
    --port "$PORT"
    --max-model-len "$MAX_MODEL_LEN"
    --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION"
    --enable-prefix-caching
    --tool-call-parser hermes
    --enable-auto-tool-choice
    --disable-log-requests
    --uvicorn-log-level warning
  )

  # exec replaces this shell, so the server receives signals directly.
  exec vllm serve "$MODEL" "${serve_args[@]}"
}

# Everything runs from main, invoked on the final line, so a truncated
# download piped into bash cannot execute a partial script.
main "$@"