From 49ccdf87e111172a28d7cea291bd5ceaf78c3eab Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Wed, 18 Mar 2026 23:13:04 -0400
Subject: [PATCH] Add vllm provisioning script for RunPod GPU instances

Sets up vllm with Qwen 2.5 27B Instruct, prefix caching enabled, Hermes
tool call parser for function calling support. Configurable via
environment variables (MODEL, PORT, MAX_MODEL_LEN).

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 scripts/provision-vllm.sh | 53 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100755 scripts/provision-vllm.sh

diff --git a/scripts/provision-vllm.sh b/scripts/provision-vllm.sh
new file mode 100755
index 0000000..e7b3a91
--- /dev/null
+++ b/scripts/provision-vllm.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# provision-vllm.sh — Set up vllm on a RunPod GPU instance
+#
+# Usage: ssh into your RunPod instance and run:
+#   curl -sSL https://raw.githubusercontent.com/... | bash
+# Or just scp this script and run it.
+#
+# Expects: NVIDIA GPU with sufficient VRAM (B200: 192GB, A100: 80GB)
+# Installs: vllm with Qwen 2.5 27B Instruct  # NOTE(review): Qwen2.5 ships 32B/14B, not 27B — verify model id
+# Exposes: OpenAI-compatible API on port 8000
+
+set -euo pipefail
+
+MODEL="${MODEL:-Qwen/Qwen2.5-27B-Instruct}"
+PORT="${PORT:-8000}"
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
+GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}"
+
+echo "=== vllm provisioning ==="
+echo "Model: $MODEL"
+echo "Port: $PORT"
+echo "Max context: $MAX_MODEL_LEN"
+echo ""
+
+# --- Install vllm ---
+echo "Installing vllm..."
+pip install --upgrade vllm 2>&1 | tail -3
+
+# --- Verify GPU ---
+echo ""
+echo "GPU status:"
+nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader
+echo ""
+
+# --- Download model (cached in /root/.cache/huggingface) ---
+echo "Downloading model (this may take a while on first run)..."
+python3 -c 'import sys; from huggingface_hub import snapshot_download; snapshot_download(sys.argv[1])' "$MODEL" 2>&1 | tail -5
+echo ""
+
+# --- Launch vllm ---
+echo "Starting vllm server on port $PORT..."
+echo "API will be available at http://0.0.0.0:$PORT/v1"
+echo ""
+
+exec vllm serve "$MODEL" \
+  --port "$PORT" \
+  --max-model-len "$MAX_MODEL_LEN" \
+  --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
+  --enable-prefix-caching \
+  --tool-call-parser hermes \
+  --enable-auto-tool-choice \
+  --disable-log-requests \
+  --uvicorn-log-level warning