The cursor index is an offset into self.input, but the rendered buffer contains the prompt prepended to the first line. We need to add prompt.len() to the cursor index to get the correct character position when scanning the buffer.
57 lines
1.7 KiB
Bash
Executable file
57 lines
1.7 KiB
Bash
Executable file
#!/bin/bash
# provision-vllm.sh — Set up vllm on a RunPod GPU instance
#
# Usage: ssh into your RunPod instance and run:
#   curl -sSL https://raw.githubusercontent.com/... | bash
# Or just scp this script and run it.
#
# Expects:  NVIDIA GPU with sufficient VRAM (B200: 192GB, A100: 80GB)
# Installs: vllm with Qwen 3.5 27B
# Exposes:  OpenAI-compatible API on port 8000 (override with PORT)
#
# Environment overrides (defaults in parentheses):
#   MODEL                   Hugging Face model id to download and serve
#                           (Qwen/Qwen3.5-27B)
#   PORT                    port passed to `vllm serve --port` (8000)
#   MAX_MODEL_LEN           value passed to --max-model-len (262144)
#   GPU_MEMORY_UTILIZATION  value passed to --gpu-memory-utilization (0.95)

# Abort on the first failing command, unset variable, or failing pipeline
# stage. pipefail matters here because installer output is piped through
# `tail`, which would otherwise mask a failed install or download.
set -euo pipefail

MODEL="${MODEL:-Qwen/Qwen3.5-27B}"
PORT="${PORT:-8000}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-262144}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.95}"

echo "=== vllm provisioning ==="
echo "Model: $MODEL"
echo "Port: $PORT"
echo "Max context: $MAX_MODEL_LEN"
echo ""

# --- Install vllm ---
echo "Installing vllm..."
# Only the last few lines of pip's output are shown to keep the log short.
pip install --upgrade vllm --break-system-packages 2>&1 | tail -3

# --- Use persistent storage ---
# Exported so that both the download step below and the vllm server
# inherit the same Hugging Face cache location.
export HF_HOME=/workspace/huggingface

# --- Verify GPU ---
echo ""
echo "GPU status:"
nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader
echo ""

# --- Download model (cached under $HF_HOME) ---
echo "Downloading model (this may take a while on first run)..."
# pip's stderr is silenced to keep the log quiet, so report a failure
# explicitly instead of letting `set -e` end the script without a message.
if ! pip install --upgrade huggingface_hub --break-system-packages -q 2>/dev/null; then
  echo "ERROR: failed to install huggingface_hub" >&2
  exit 1
fi
# The model id is passed as an argument (sys.argv[1]) rather than being
# interpolated into the Python source, so quotes or other special
# characters in MODEL cannot break or alter the Python code.
python3 -c 'import sys
from huggingface_hub import snapshot_download
snapshot_download(sys.argv[1])' "$MODEL" 2>&1 | tail -5
echo ""

# --- Launch vllm ---
echo "Starting vllm server on port $PORT..."
echo "API will be available at http://0.0.0.0:$PORT/v1"
echo ""

# exec replaces this shell with the server process, so nothing placed
# after this command will run.
exec vllm serve "$MODEL" \
  --port "$PORT" \
  --max-model-len "$MAX_MODEL_LEN" \
  --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
  --enable-prefix-caching \
  --tool-call-parser hermes \
  --enable-auto-tool-choice \
  --reasoning-parser=qwen3 \
  --uvicorn-log-level warning