consciousness/training/vllm_export_hook.py
ProofOfConcept c1245ab139 apollo-checkpoint: efficient diff-based GPU weight checkpointing
Rust tool that mmaps previous checkpoint, diffs against live GPU weights
(via CUDA IPC handles), and only writes changed blocks. For small
behavioral training steps, turns 54GB write into ~500MB.

Also includes vllm_export_hook.py with direct source patch approach —
exports IPC handles from vLLM's worker subprocess after model load.

Run every 10 minutes via cron to protect against vLLM crashes.
Daily rsync to moria for long-term storage.
2026-03-30 22:53:17 -04:00

76 lines
2.2 KiB
Python

"""Monkey-patch vLLM to export weight IPC handles on startup.
Usage — add to start_vllm.sh BEFORE the vllm serve command:
export VLLM_PLUGINS=vllm_export_hook
vllm serve Qwen/Qwen3.5-27B ...
Or use Python to launch vLLM with the hook:
python3 -c "
import vllm_export_hook # installs the patch
from vllm.entrypoints.openai.api_server import run_server
run_server(...)
"
The hook wraps vLLM's gpu_worker.Worker.load_model so that CUDA IPC
handles are exported as soon as model loading completes. The handles are
saved to /tmp/vllm_weight_handles.pt, which the Apollo training process reads.
"""
import atexit
import torch
from pathlib import Path
# Default file to which export_model_weights() serializes the handle dict.
HANDLE_PATH = "/tmp/vllm_weight_handles.pt"
def export_model_weights(model, path=None):
    """Export CUDA IPC handles for all CUDA-resident model parameters.

    Every parameter that lives on a CUDA device is passed through
    ``torch.multiprocessing.reductions.reduce_tensor``; the result is
    stored, together with the parameter's shape and dtype string, in a
    dict keyed by parameter name. The dict is serialized with
    ``torch.save``.

    The file is first written to a temporary sibling path and then moved
    over the destination with ``os.replace``, so a process that reads the
    destination concurrently never sees a partially written file.

    Args:
        model: Object exposing ``named_parameters()`` (e.g. a
            ``torch.nn.Module``). Parameters that are not on a CUDA
            device are skipped.
        path: Destination file. Defaults to the module-level
            ``HANDLE_PATH`` when omitted.

    Raises:
        Any exception raised while reducing tensors, saving, or renaming
        is propagated; a leftover temporary file is removed first.

    NOTE(review): if several worker processes call this with the same
    destination, the last writer wins -- confirm whether multi-GPU
    deployments need a per-rank file.
    NOTE(review): the saved file contains pickled objects; whoever loads
    it should only do so from a location they trust.
    """
    import os
    from torch.multiprocessing.reductions import reduce_tensor

    target = os.fspath(HANDLE_PATH if path is None else path)

    handles = {}
    total_bytes = 0
    for name, param in model.named_parameters():
        if param.device.type != 'cuda':
            continue
        handles[name] = {
            # .data is shared instead of the parameter itself, as in the
            # original implementation.
            'handle': reduce_tensor(param.data),
            'shape': list(param.shape),
            'dtype': str(param.dtype),
        }
        total_bytes += param.nelement() * param.element_size()

    # The PID suffix keeps concurrent writers from clobbering each other's
    # temporary file; same directory keeps os.replace on one filesystem.
    tmp_path = f"{target}.tmp.{os.getpid()}"
    try:
        torch.save(handles, tmp_path)
        os.replace(tmp_path, target)
    finally:
        # After a successful replace the temp file is gone; this only
        # cleans up when saving or renaming failed part-way.
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)

    print(f"[apollo] Exported {len(handles)} weight handles "
          f"({total_bytes / 1e9:.1f} GB) to {target}")
def _patch_model_runner():
    """Wrap ``gpu_worker.Worker.load_model`` to export handles after loading.

    vLLM loads the model in a subprocess (EngineCore_DP0), so we
    can't patch from the parent. Instead, ``Worker.load_model`` is
    replaced at the module level -- the subprocess imports the same
    modules.

    The wrapper calls the original ``load_model``, then attempts
    ``export_model_weights(self.model_runner.model)``. Export is
    best-effort: any exception is reported on stdout and the original
    return value is still handed back, so a failed export does not
    prevent the worker from finishing its load.

    The function is idempotent: if ``load_model`` already carries the
    hook marker it returns without wrapping again, so calling it more
    than once does not cause repeated exports per load.
    """
    import functools

    from vllm.v1.worker import gpu_worker

    original_load = gpu_worker.Worker.load_model
    if getattr(original_load, "_apollo_export_hook", False):
        # Already patched; wrapping again would export once per layer of
        # wrapping on every load.
        return

    @functools.wraps(original_load)
    def patched_load(self, *args, **kwargs):
        result = original_load(self, *args, **kwargs)
        try:
            export_model_weights(self.model_runner.model)
        except Exception as e:
            # Deliberately swallowed: exporting handles must never break
            # model loading itself.
            print(f"[apollo] Failed to export weights: {e}")
        return result

    # Marker checked above to detect an existing installation.
    patched_load._apollo_export_hook = True
    gpu_worker.Worker.load_model = patched_load
    print("[apollo] Weight export hook installed")
# Auto-install when imported: the call below runs at import time, so merely
# importing this module patches gpu_worker.Worker.load_model.
_patch_model_runner()