#!/bin/bash
# provision-mistralrs.sh — Set up mistral.rs on a RunPod GPU instance
#
# Alternative to vLLM for inference. Pure Rust, more debuggable,
# OpenAI-compatible API. Testing whether it fixes the IncompleteMessage
# errors we're seeing with vLLM on large payloads.
#
# Usage: ssh into your RunPod instance and run this script.
# Runs on port 8001 to coexist with vLLM on 8000.
set -euo pipefail

# Configuration — both overridable from the environment.
MODEL="${MODEL:-Qwen/Qwen3.5-27B}"
PORT="${PORT:-8001}"

# Announce what we're about to provision.
printf '=== mistral.rs provisioning ===\n'
printf 'Model: %s\n' "$MODEL"
printf 'Port: %s\n\n' "$PORT"
# --- Verify GPU ---
# Fail early with a clear message if nvidia-smi is missing (e.g. wrong
# instance type), instead of letting `set -e` abort on a bare
# "command not found".
if ! command -v nvidia-smi >/dev/null 2>&1; then
  echo "ERROR: nvidia-smi not found — is this a GPU instance?" >&2
  exit 1
fi
echo "GPU status:"
nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader
echo ""
# --- Install mistral.rs ---
# NOTE(review): piping curl into sh runs remote code sight-unseen. Fine for
# a throwaway pod; pin a release tag/commit if this becomes load-bearing.
echo "Installing mistral.rs..."
curl --proto '=https' --tlsv1.2 -sSf \
  https://raw.githubusercontent.com/EricLBuehler/mistral.rs/master/install.sh | sh

# The installer may place the binary somewhere not yet on this shell's PATH
# (installers typically edit your profile, which an already-running shell
# won't re-read). Verify now so later steps don't die cryptically.
hash -r
if ! command -v mistralrs >/dev/null 2>&1; then
  echo "ERROR: 'mistralrs' is not on PATH after install." >&2
  echo "Open a new shell (or source your profile) and re-run this script." >&2
  exit 1
fi
# --- Use persistent storage for model cache ---
export HF_HOME="${HF_HOME:-/workspace/huggingface}"
mkdir -p "$HF_HOME"
# --- Run hardware tune first ---
# Let mistral.rs benchmark this hardware before we start serving.
printf 'Running hardware benchmark...\n'
mistralrs tune
# --- Start server ---
printf '\nStarting mistral.rs server on port %s...\n' "$PORT"
printf 'API: http://0.0.0.0:%s/v1\n' "$PORT"
printf 'UI: http://0.0.0.0:%s/ui\n\n' "$PORT"

# Run in foreground (use screen/tmux to background)
serve_args=(--ui --port "$PORT" -m "$MODEL")
mistralrs serve "${serve_args[@]}"