consciousness/scripts/provision-vllm.sh

#!/bin/bash
# provision-vllm.sh — Set up vLLM on a RunPod GPU instance
#
# Usage: ssh into your RunPod instance and run:
#   curl -sSL https://raw.githubusercontent.com/... | bash
# Or just scp this script over and run it.
#
# Expects:  an NVIDIA GPU with sufficient VRAM (B200: 192 GB, A100: 80 GB)
# Installs: vLLM, serving Qwen 2.5 32B Instruct by default
# Exposes:  an OpenAI-compatible API on port 8000
set -euo pipefail
MODEL="${MODEL:-Qwen/Qwen2.5-32B-Instruct}"
PORT="${PORT:-8000}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}"
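# All four settings can be overridden from the environment. For example, to
# serve a smaller model on a different port (illustrative values only):
#   MODEL=Qwen/Qwen2.5-14B-Instruct PORT=8080 bash provision-vllm.sh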
echo "=== vllm provisioning ==="
echo "Model: $MODEL"
echo "Port: $PORT"
echo "Max context: $MAX_MODEL_LEN"
echo ""
# --- Install vLLM ---
echo "Installing vLLM..."
pip install --upgrade vllm 2>&1 | tail -3
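# Note: this pulls the latest vLLM from PyPI, so behavior can drift between
# provisioning runs. For reproducible pods, consider pinning a version you
# have tested, e.g.:
#   pip install "vllm==<tested-version>"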
# --- Verify GPU ---
echo ""
echo "GPU status:"
nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader
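# Rough sizing check: a 32B model in bf16 needs ~64 GB for weights alone
# (32e9 params x 2 bytes), plus KV cache that grows with MAX_MODEL_LEN, so
# an 80 GB A100 is close to the floor at the default 32K context.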
echo ""
# --- Download model (cached in /root/.cache/huggingface) ---
echo "Downloading model (this may take a while on first run)..."
python3 -c "from huggingface_hub import snapshot_download; snapshot_download('$MODEL')" 2>&1 | tail -5
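# Gated or private models additionally need HF_TOKEN exported before this
# step. snapshot_download skips files already in the cache, so rerunning the
# script after an interrupted pull is safe.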
echo ""
# --- Launch vLLM ---
echo "Starting vLLM server on port $PORT..."
echo "API will be available at http://0.0.0.0:$PORT/v1"
echo ""
exec vllm serve "$MODEL" \
    --port "$PORT" \
    --max-model-len "$MAX_MODEL_LEN" \
    --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
    --enable-prefix-caching \
    --tool-call-parser hermes \
    --enable-auto-tool-choice \
    --disable-log-requests \
    --uvicorn-log-level warning
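
# --- Smoke test (run from another shell once the server is up; nothing
# below the exec above ever executes). These are the standard
# OpenAI-compatible endpoints vLLM exposes; the "model" field must match
# the served $MODEL, and host/port must match your settings above:
#   curl -s http://localhost:8000/v1/models
#   curl -s http://localhost:8000/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"model": "Qwen/Qwen2.5-32B-Instruct", "messages": [{"role": "user", "content": "Say hi"}]}'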