#!/bin/bash
# provision-mistralrs.sh — Set up mistral.rs on a RunPod GPU instance
#
# Alternative to vLLM for inference. Pure Rust, more debuggable,
# OpenAI-compatible API. Testing whether it fixes the IncompleteMessage
# errors we're seeing with vLLM on large payloads.
#
# Usage: ssh into your RunPod instance and run this script.
# Runs on port 8001 to coexist with vLLM on 8000.
#
# Env vars (all optional):
#   MODEL   - HF model id to serve (default: Qwen/Qwen3.5-27B)
#   PORT    - port for the OpenAI-compatible API (default: 8001)
#   HF_HOME - Hugging Face cache dir (default: /workspace/huggingface)

set -euo pipefail

MODEL="${MODEL:-Qwen/Qwen3.5-27B}"
PORT="${PORT:-8001}"

echo "=== mistral.rs provisioning ==="
echo "Model: $MODEL"
echo "Port: $PORT"
echo ""

# --- Verify GPU ---
# Under `set -e` this also acts as a hard precondition: if nvidia-smi is
# missing or no GPU is visible, the script stops here instead of failing
# later mid-install.
echo "GPU status:"
nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader
echo ""

# --- Install mistral.rs ---
echo "Installing mistral.rs..."
curl --proto '=https' --tlsv1.2 -sSf \
  https://raw.githubusercontent.com/EricLBuehler/mistral.rs/master/install.sh | sh

# The installer may place the binary in a user-local bin dir that a fresh
# shell doesn't have on PATH yet; add the usual candidates, then fail fast
# with a clear message rather than a confusing "command not found" later.
export PATH="$HOME/.cargo/bin:$HOME/.local/bin:$PATH"
if ! command -v mistralrs >/dev/null 2>&1; then
  echo "error: mistralrs not found on PATH after install" >&2
  echo "hint: check the installer output above for its install location" >&2
  exit 1
fi

# --- Use persistent storage for model cache ---
# RunPod's /workspace survives pod restarts; avoid re-downloading weights.
export HF_HOME="${HF_HOME:-/workspace/huggingface}"
mkdir -p "$HF_HOME"

# --- Run hardware tune first ---
echo "Running hardware benchmark..."
mistralrs tune

# --- Start server ---
echo ""
echo "Starting mistral.rs server on port $PORT..."
echo "API: http://0.0.0.0:$PORT/v1"
echo "UI: http://0.0.0.0:$PORT/ui"
echo ""

# Run in foreground (use screen/tmux to background)
mistralrs serve \
  --ui \
  --port "$PORT" \
  -m "$MODEL"