diff --git a/scripts/provision-vllm.sh b/scripts/provision-vllm.sh
new file mode 100755
index 0000000..e7b3a91
--- /dev/null
+++ b/scripts/provision-vllm.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# provision-vllm.sh — Set up vllm on a RunPod GPU instance
+#
+# Usage: ssh into your RunPod instance and run:
+#   curl -sSL https://raw.githubusercontent.com/... | bash
+# Or just scp this script and run it.
+#
+# Expects: NVIDIA GPU with sufficient VRAM (B200: 192GB, A100: 80GB)
+# Installs: vllm with Qwen 2.5 32B Instruct
+# Exposes: OpenAI-compatible API on port 8000
+
+set -euo pipefail
+
+# Qwen2.5 ships in 0.5/1.5/3/7/14/32/72B sizes; the previously listed
+# 27B variant does not exist on the Hub and the download step would fail.
+MODEL="${MODEL:-Qwen/Qwen2.5-32B-Instruct}"
+PORT="${PORT:-8000}"
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
+GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}"
+
+echo "=== vllm provisioning ==="
+echo "Model: $MODEL"
+echo "Port: $PORT"
+echo "Max context: $MAX_MODEL_LEN"
+echo ""
+
+# --- Verify GPU first: fail fast before a multi-GB install/download ---
+command -v nvidia-smi >/dev/null 2>&1 || {
+  echo "error: nvidia-smi not found — is this a GPU instance?" >&2
+  exit 1
+}
+echo "GPU status:"
+nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader
+echo ""
+
+# --- Install vllm ---
+echo "Installing vllm..."
+pip install --upgrade vllm 2>&1 | tail -3
+
+# --- Download model (cached in /root/.cache/huggingface) ---
+echo ""
+echo "Downloading model (this may take a while on first run)..."
+# Pass the model id as argv rather than interpolating it into the Python
+# source, so names containing quotes/metacharacters cannot break or inject.
+python3 -c "import sys; from huggingface_hub import snapshot_download; snapshot_download(sys.argv[1])" "$MODEL" 2>&1 | tail -5
+echo ""
+
+# --- Launch vllm (exec replaces the shell so signals reach the server) ---
+echo "Starting vllm server on port $PORT..."
+echo "API will be available at http://0.0.0.0:$PORT/v1"
+echo ""
+
+exec vllm serve "$MODEL" \
+  --port "$PORT" \
+  --max-model-len "$MAX_MODEL_LEN" \
+  --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
+  --enable-prefix-caching \
+  --tool-call-parser hermes \
+  --enable-auto-tool-choice \
+  --disable-log-requests \
+  --uvicorn-log-level warning