#!/bin/bash
#
# provision-mistralrs.sh — Provision mistral.rs on a RunPod GPU instance.
#
# An alternative inference backend to vLLM: pure Rust, easier to debug,
# and exposes an OpenAI-compatible API. We are testing whether it avoids
# the IncompleteMessage errors vLLM produces on large payloads.
#
# Usage: ssh into the RunPod instance and run this script.
# Listens on port 8001 so it can coexist with vLLM on port 8000.

# Fail fast: abort on errors, unset variables, and mid-pipeline failures.
set -euo pipefail
# Configuration — both overridable via the environment.
MODEL="${MODEL:-Qwen/Qwen3.5-27B}"   # Hugging Face model ID to serve
PORT="${PORT:-8001}"                 # 8001 so vLLM can keep 8000

# Announce what this run will provision.
printf '=== mistral.rs provisioning ===\n'
printf 'Model: %s\n' "$MODEL"
printf 'Port: %s\n' "$PORT"
printf '\n'
# --- Verify GPU ---
# Print GPU name and memory so the log shows what hardware we got.
# Fail with a clear message instead of a bare "command not found" (127)
# when nvidia-smi is missing — i.e. this is not actually a GPU instance.
echo "GPU status:"
if ! command -v nvidia-smi >/dev/null 2>&1; then
  echo "ERROR: nvidia-smi not found — is this a GPU instance?" >&2
  exit 1
fi
nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader
echo ""
# --- Install mistral.rs ---
# Idempotent: skip the network install when the `mistralrs` binary (the
# name invoked later in this script) is already on PATH, so re-running
# the provisioner is cheap. Delete the binary to force a reinstall/upgrade.
# NOTE: curl | sh runs the upstream installer unsandboxed — intentional
# here, same trust model as the original script.
if command -v mistralrs >/dev/null 2>&1; then
  echo "mistral.rs already installed; skipping install."
else
  echo "Installing mistral.rs..."
  curl --proto '=https' --tlsv1.2 -sSf \
    https://raw.githubusercontent.com/EricLBuehler/mistral.rs/master/install.sh | sh
fi
# --- Use persistent storage for model cache ---
# /workspace survives RunPod pod restarts, so model downloads are kept.
: "${HF_HOME:=/workspace/huggingface}"
export HF_HOME
mkdir -p "$HF_HOME"
# --- Run hardware tune first ---
# Benchmarks the local hardware before serving; under `set -e` a tune
# failure aborts provisioning rather than starting a misconfigured server.
printf 'Running hardware benchmark...\n'
mistralrs tune
# --- Start server ---
printf '\nStarting mistral.rs server on port %s...\n' "$PORT"
printf 'API: http://0.0.0.0:%s/v1\n' "$PORT"
printf 'UI: http://0.0.0.0:%s/ui\n' "$PORT"
printf '\n'

# Runs in the foreground by design — wrap in screen/tmux to background it.
serve_args=(--ui --port "$PORT" -m "$MODEL")
mistralrs serve "${serve_args[@]}"