consciousness/scripts/provision-mi300x.sh

#!/bin/bash
# provision-mi300x.sh — Set up vllm on an MI300X GPU instance (ROCm)
#
# Usage: ssh into your instance and run this script.
#
# Expects: AMD MI300X GPU with ROCm drivers
# Installs: vllm (ROCm wheels) with Qwen 3.5 27B
# Exposes: OpenAI-compatible API on port 8000
#
# Key differences from B200/CUDA setup:
#   - ROCm wheels from wheels.vllm.ai/rocm
#   - AITER attention backends (2.7-4.4x speedup)
#   - Reduced cudagraph capture size (DeltaNet cache conflict)
#   - BF16 model + FP8 KV cache (FP8 weights can be slower on MI300X)

# Strict mode: abort on command failure, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Runtime settings. Each keeps a value already present in the environment and
# otherwise falls back to the default shown here.
: "${MODEL:=Qwen/Qwen3.5-27B}"
: "${PORT:=8000}"
: "${MAX_MODEL_LEN:=131072}"
: "${GPU_MEMORY_UTILIZATION:=0.90}"
# Set FP8=1 to use FP8 model weights (for benchmarking vs BF16)
: "${FP8:=0}"

# Startup banner summarising the effective settings (ends with a blank line).
cat <<EOF
=== MI300X vllm provisioning ===
Model: $MODEL
Port:  $PORT
Max context: $MAX_MODEL_LEN

EOF

# --- Check for ROCm ---
# The presence of rocm-smi on PATH is used as the indicator that ROCm is
# installed. Without it the script stops before installing anything.
if ! command -v rocm-smi >/dev/null 2>&1; then
    # Diagnostics go to stderr so they are not lost when stdout is captured.
    echo "ERROR: rocm-smi not found. Is ROCm installed?" >&2
    exit 1
fi

echo "GPU status:"
# Try the focused product-name/VRAM report first; if that invocation fails,
# fall back to rocm-smi's default output.
rocm-smi --showproductname --showmeminfo vram 2>/dev/null || rocm-smi
echo ""

# --- Install vllm (ROCm wheels) ---
echo "Installing vllm (ROCm)..."
# pip arguments: upgrade vllm, additionally consulting the ROCm wheel index,
# and allow installing into a system-managed Python environment.
vllm_pip_args=(
    install --upgrade vllm
    --extra-index-url https://wheels.vllm.ai/rocm
    --break-system-packages
)
# stdout and stderr are merged and only the last 5 lines are shown.
pip "${vllm_pip_args[@]}" 2>&1 | tail -5

# --- Use persistent storage if available ---
# When a /workspace directory exists, point HF_HOME at a subdirectory of it
# and export it for the commands that follow.
persistent_root=/workspace
if [[ -d "$persistent_root" ]]; then
    HF_HOME="$persistent_root/huggingface"
    export HF_HOME
    printf 'Using persistent storage: %s\n' "$HF_HOME"
fi

# --- Download model ---
# Fetch the model snapshot up front so the download happens before the server
# is started.
echo ""
echo "Downloading model (this may take a while on first run)..."
# pip's own chatter stays suppressed, but a failed install is reported
# explicitly instead of letting `set -e` end the script without any message.
if ! pip install --upgrade huggingface_hub --break-system-packages -q 2>/dev/null; then
    echo "ERROR: failed to install huggingface_hub" >&2
    exit 1
fi
# The model id is passed as an argument (sys.argv[1]) rather than spliced into
# the Python source, so quotes or other special characters in $MODEL cannot
# break or alter the code being executed.
python3 -c 'import sys
from huggingface_hub import snapshot_download
snapshot_download(sys.argv[1])' "$MODEL" 2>&1 | tail -5
echo ""

# --- Launch vllm ---
# Announce where the server will listen (followed by a blank line).
printf 'Starting vllm server on port %s...\n' "$PORT"
printf 'API will be available at http://0.0.0.0:%s/v1\n\n' "$PORT"

# ROCm-specific environment variables, exported for the server process:
#   VLLM_ROCM_USE_AITER          enable optimized AITER attention backends
#   HIP_FORCE_DEV_KERNARG        kernel launch performance
#   TORCH_BLAS_PREFER_HIPBLASLT  better BLAS performance
export VLLM_ROCM_USE_AITER=1 \
       HIP_FORCE_DEV_KERNARG=1 \
       TORCH_BLAS_PREFER_HIPBLASLT=1

# Build the precision-related flags for `vllm serve`.
#   $1 - "1" selects FP8 model weights; any other value (default "0") keeps
#        BF16 weights.
# Prints the flags on stdout as one space-separated string.
#
# Both modes store the KV cache as fp8_e4m3. FP8 weights are requested through
# --quantization fp8: vllm's --dtype option only takes floating-point compute
# types (auto/half/float16/bfloat16/float/float32), so "--dtype fp8_e4m3" is
# rejected when the server parses its arguments.
select_dtype_args() {
    local fp8_weights="${1:-0}"
    local flags="--dtype bfloat16"
    if [[ "$fp8_weights" == "1" ]]; then
        flags+=" --quantization fp8"
    fi
    printf '%s' "$flags --kv-cache-dtype fp8_e4m3"
}

# DTYPE_ARGS stays a plain space-separated string so that an unquoted
# $DTYPE_ARGS on the vllm command line expands to the individual flags.
DTYPE_ARGS="$(select_dtype_args "${FP8:-0}")"
if [[ "${FP8:-0}" == "1" ]]; then
    echo "*** FP8 mode: model weights AND KV cache in FP8 ***"
else
    echo "*** BF16 mode: model in BF16, KV cache in FP8 ***"
fi

# Start the OpenAI-compatible server. `exec` replaces this shell with the
# vllm process, so nothing after this command runs.
#
# Flag notes:
#   --tool-call-parser qwen3_coder   parser name as given in the change that
#                                    switched away from hermes; the previous
#                                    "qwen35_coder" spelling looks like a typo
#                                    - TODO confirm against the installed vllm
#   --max-cudagraph-capture-size 64  reduced capture size (DeltaNet cache
#                                    conflict, per the header of this script)
#
# DTYPE_ARGS holds several space-separated flags and must be word-split, so it
# is intentionally left unquoted.
# shellcheck disable=SC2086
exec vllm serve "$MODEL" \
    --port "$PORT" \
    $DTYPE_ARGS \
    --max-model-len "$MAX_MODEL_LEN" \
    --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
    --enable-prefix-caching \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --reasoning-parser qwen3 \
    --trust-remote-code \
    --max-cudagraph-capture-size 64 \
    --uvicorn-log-level warning
Add MI300X provisioning script for vllm/Qwen 3.5 27B ROCm-specific setup with: - AITER attention backends (VLLM_ROCM_USE_AITER=1) - Reduced cudagraph capture size (DeltaNet cache conflict) - BF16 model + FP8 KV cache as default (FP8 weights can be slower on MI300X due to ROCm kernel maturity) - FP8=1 flag for benchmarking FP8 model weights Key for training plan: if FP8 matmuls are slow on MI300X, the quantize-and-expand strategy needs B200 instead. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-19 14:40:15 -04:00			`#!/bin/bash`
			`# provision-mi300x.sh — Set up vllm on an MI300X GPU instance (ROCm)`
			`#`
			`# Usage: ssh into your instance and run this script.`
			`#`
			`# Expects: AMD MI300X GPU with ROCm drivers`
			`# Installs: vllm (ROCm wheels) with Qwen 3.5 27B`
			`# Exposes: OpenAI-compatible API on port 8000`
			`#`
			`# Key differences from B200/CUDA setup:`
			`# - ROCm wheels from wheels.vllm.ai/rocm`
			`# - AITER attention backends (2.7-4.4x speedup)`
			`# - Reduced cudagraph capture size (DeltaNet cache conflict)`
			`# - BF16 model + FP8 KV cache (FP8 weights can be slower on MI300X)`

			`set -euo pipefail`

			`MODEL="${MODEL:-Qwen/Qwen3.5-27B}"`
			`PORT="${PORT:-8000}"`
			`MAX_MODEL_LEN="${MAX_MODEL_LEN:-131072}"`
			`GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}"`
			`# Set FP8=1 to use FP8 model weights (for benchmarking vs BF16)`
			`FP8="${FP8:-0}"`

			`echo "=== MI300X vllm provisioning ==="`
			`echo "Model: $MODEL"`
			`echo "Port: $PORT"`
			`echo "Max context: $MAX_MODEL_LEN"`
			`echo ""`

			`# --- Check for ROCm ---`
			`if ! command -v rocm-smi &>/dev/null; then`
			`echo "ERROR: rocm-smi not found. Is ROCm installed?"`
			`exit 1`
			`fi`

			`echo "GPU status:"`
			`rocm-smi --showproductname --showmeminfo vram 2>/dev/null \|\| rocm-smi`
			`echo ""`

			`# --- Install vllm (ROCm wheels) ---`
			`echo "Installing vllm (ROCm)..."`
			`pip install --upgrade vllm \`
			`--extra-index-url https://wheels.vllm.ai/rocm \`
			`--break-system-packages 2>&1 \| tail -5`

			`# --- Use persistent storage if available ---`
			`if [ -d /workspace ]; then`
			`export HF_HOME=/workspace/huggingface`
			`echo "Using persistent storage: $HF_HOME"`
			`fi`

			`# --- Download model ---`
			`echo ""`
			`echo "Downloading model (this may take a while on first run)..."`
			`pip install --upgrade huggingface_hub --break-system-packages -q 2>/dev/null`
			`python3 -c "from huggingface_hub import snapshot_download; snapshot_download('$MODEL')" 2>&1 \| tail -5`
			`echo ""`

			`# --- Launch vllm ---`
			`echo "Starting vllm server on port $PORT..."`
			`echo "API will be available at http://0.0.0.0:$PORT/v1"`
			`echo ""`

			`# ROCm-specific environment variables`
			`export VLLM_ROCM_USE_AITER=1 # Enable optimized AITER attention backends`
			`export HIP_FORCE_DEV_KERNARG=1 # Kernel launch performance`
			`export TORCH_BLAS_PREFER_HIPBLASLT=1 # Better BLAS performance`

			`DTYPE_ARGS="--dtype bfloat16 --kv-cache-dtype fp8_e4m3"`
			`if [ "$FP8" = "1" ]; then`
			`DTYPE_ARGS="--dtype fp8_e4m3"`
			`echo "* FP8 mode: model weights AND KV cache in FP8 *"`
			`else`
			`echo "* BF16 mode: model in BF16, KV cache in FP8 *"`
			`fi`

			`exec vllm serve "$MODEL" \`
			`--port "$PORT" \`
			`$DTYPE_ARGS \`
			`--max-model-len "$MAX_MODEL_LEN" \`
			`--gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \`
			`--enable-prefix-caching \`
			`--enable-auto-tool-choice \`
Consolidate poc-memory and poc-agent configs poc-memory now reads from poc-agent's config.json5 as the primary config source. Memory-specific settings live in a "memory" section; API credentials are resolved from the shared model/backend config instead of being duplicated. - Add "memory" section to ~/.config/poc-agent/config.json5 - poc-memory config.rs: try shared config first, fall back to legacy JSONL - API fields (base_url, api_key, model) resolved via memory.agent_model -> models -> backend lookup - Add json5 dependency for proper JSON5 parsing - Update provisioning scripts: hermes -> qwen3_coder tool parser Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-19 21:49:58 -04:00			`--tool-call-parser qwen35_coder \`
Add MI300X provisioning script for vllm/Qwen 3.5 27B ROCm-specific setup with: - AITER attention backends (VLLM_ROCM_USE_AITER=1) - Reduced cudagraph capture size (DeltaNet cache conflict) - BF16 model + FP8 KV cache as default (FP8 weights can be slower on MI300X due to ROCm kernel maturity) - FP8=1 flag for benchmarking FP8 model weights Key for training plan: if FP8 matmuls are slow on MI300X, the quantize-and-expand strategy needs B200 instead. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-19 14:40:15 -04:00			`--reasoning-parser qwen3 \`
			`--trust-remote-code \`
			`--max-cudagraph-capture-size 64 \`
			`--uvicorn-log-level warning`