#!/bin/bash
# provision-mistralrs.sh — Set up mistral.rs on a RunPod GPU instance
#
# Alternative to vLLM for inference. Pure Rust, more debuggable,
# OpenAI-compatible API. Testing whether it fixes the IncompleteMessage
# errors we're seeing with vLLM on large payloads.
#
# Usage: ssh into your RunPod instance and run this script.
# Runs on port 8001 to coexist with vLLM on 8000.
#
# Env vars (all optional):
#   MODEL   - HF model id to serve (default: Qwen/Qwen3.5-27B)
#   PORT    - port for the OpenAI-compatible API (default: 8001)
#   HF_HOME - Hugging Face cache dir (default: /workspace/huggingface)

set -euo pipefail

MODEL="${MODEL:-Qwen/Qwen3.5-27B}"
PORT="${PORT:-8001}"

echo "=== mistral.rs provisioning ==="
echo "Model: $MODEL"
echo "Port: $PORT"
echo ""

# --- Verify GPU ---
# Under `set -e` this also acts as a hard precondition: if nvidia-smi is
# missing or no GPU is visible, the script stops here instead of failing
# later mid-install.
echo "GPU status:"
nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader
echo ""

# --- Install mistral.rs ---
echo "Installing mistral.rs..."
curl --proto '=https' --tlsv1.2 -sSf \
  https://raw.githubusercontent.com/EricLBuehler/mistral.rs/master/install.sh | sh

# The installer may place the binary in a user-local bin dir that a fresh
# shell doesn't have on PATH yet; add the usual candidates, then fail fast
# with a clear message rather than a confusing "command not found" later.
export PATH="$HOME/.cargo/bin:$HOME/.local/bin:$PATH"
if ! command -v mistralrs >/dev/null 2>&1; then
  echo "error: mistralrs not found on PATH after install" >&2
  echo "hint: check the installer output above for its install location" >&2
  exit 1
fi

# --- Use persistent storage for model cache ---
# RunPod's /workspace survives pod restarts; avoid re-downloading weights.
export HF_HOME="${HF_HOME:-/workspace/huggingface}"
mkdir -p "$HF_HOME"

# --- Run hardware tune first ---
echo "Running hardware benchmark..."
mistralrs tune

# --- Start server ---
echo ""
echo "Starting mistral.rs server on port $PORT..."
echo "API: http://0.0.0.0:$PORT/v1"
echo "UI: http://0.0.0.0:$PORT/ui"
echo ""

# Run in foreground (use screen/tmux to background)
mistralrs serve \
  --ui \
  --port "$PORT" \
  -m "$MODEL"