#!/bin/bash
# provision-mistralrs.sh — Set up mistral.rs on a RunPod GPU instance
#
# Alternative to vLLM for inference. Pure Rust, more debuggable,
# OpenAI-compatible API. Testing whether it fixes the IncompleteMessage
# errors we're seeing with vLLM on large payloads.
#
# Usage: ssh into your RunPod instance and run this script.
# Runs on port 8001 to coexist with vLLM on 8000.
set -euo pipefail

# Configuration — both overridable from the environment.
MODEL="${MODEL:-Qwen/Qwen3.5-27B}"
PORT="${PORT:-8001}"

# Announce what we're about to provision.
printf '=== mistral.rs provisioning ===\n'
printf 'Model: %s\n' "$MODEL"
printf 'Port: %s\n\n' "$PORT"
# --- Verify GPU ---
# Fail early with a clear message if nvidia-smi is missing (e.g. wrong
# instance type), instead of letting `set -e` abort on a bare
# "command not found".
if ! command -v nvidia-smi >/dev/null 2>&1; then
  echo "ERROR: nvidia-smi not found — is this a GPU instance?" >&2
  exit 1
fi
echo "GPU status:"
nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader
echo ""
# --- Install mistral.rs ---
# NOTE(review): piping curl into sh runs remote code sight-unseen. Fine for
# a throwaway pod; pin a release tag/commit if this becomes load-bearing.
echo "Installing mistral.rs..."
curl --proto '=https' --tlsv1.2 -sSf \
  https://raw.githubusercontent.com/EricLBuehler/mistral.rs/master/install.sh | sh

# The installer may place the binary somewhere not yet on this shell's PATH
# (installers typically edit your profile, which an already-running shell
# won't re-read). Verify now so later steps don't die cryptically.
hash -r
if ! command -v mistralrs >/dev/null 2>&1; then
  echo "ERROR: 'mistralrs' is not on PATH after install." >&2
  echo "Open a new shell (or source your profile) and re-run this script." >&2
  exit 1
fi
# --- Use persistent storage for model cache ---
export HF_HOME="${HF_HOME:-/workspace/huggingface}"
mkdir -p "$HF_HOME"
# --- Run hardware tune first ---
# Let mistral.rs benchmark this hardware before we start serving.
printf 'Running hardware benchmark...\n'
mistralrs tune
# --- Start server ---
printf '\nStarting mistral.rs server on port %s...\n' "$PORT"
printf 'API: http://0.0.0.0:%s/v1\n' "$PORT"
printf 'UI: http://0.0.0.0:%s/ui\n\n' "$PORT"

# Run in foreground (use screen/tmux to background)
serve_args=(--ui --port "$PORT" -m "$MODEL")
mistralrs serve "${serve_args[@]}"