vllm weight export hook: monkey-patches model runner to save IPC handles on load

This commit is contained in:
ProofOfConcept 2026-03-30 22:20:04 -04:00
parent 8e7b4a22db
commit 0402a9333c

View file

@ -0,0 +1,70 @@
"""Monkey-patch vLLM to export weight IPC handles on startup.
Usage add to start_vllm.sh BEFORE the vllm serve command:
export VLLM_PLUGINS=vllm_export_hook
vllm serve Qwen/Qwen3.5-27B ...
Or use Python to launch vLLM with the hook:
python3 -c "
import vllm_export_hook # installs the patch
from vllm.entrypoints.openai.api_server import run_server
run_server(...)
"
The hook patches vLLM's model runner to export IPC handles after
model loading completes. The handles are saved to a file that the
Apollo training process reads.
"""
import atexit
import torch
from pathlib import Path
HANDLE_PATH = "/tmp/vllm_weight_handles.pt"
def export_model_weights(model):
"""Export CUDA IPC handles for all model parameters."""
from torch.multiprocessing.reductions import reduce_tensor
handles = {}
total_bytes = 0
for name, param in model.named_parameters():
if param.device.type != 'cuda':
continue
handle = reduce_tensor(param.data)
handles[name] = {
'handle': handle,
'shape': list(param.shape),
'dtype': str(param.dtype),
}
total_bytes += param.nelement() * param.element_size()
torch.save(handles, HANDLE_PATH)
print(f"[apollo] Exported {len(handles)} weight handles "
f"({total_bytes / 1e9:.1f} GB) to {HANDLE_PATH}")
def _patch_model_runner():
    """Patch GPUModelRunner.load_model to export IPC handles after load.

    Wraps ``vllm.v1.worker.gpu_model_runner.GPUModelRunner.load_model`` so
    that ``export_model_weights(self.model)`` runs once the original load
    completes. Export failures are logged and swallowed on purpose — a
    failed export must never abort model loading (best effort).

    Idempotent: calling this more than once installs the wrapper only once,
    so a repeated call cannot double-wrap ``load_model``.
    """
    import functools

    from vllm.v1.worker import gpu_model_runner

    runner_cls = gpu_model_runner.GPUModelRunner
    # Guard against double-patching if this is invoked again (e.g. manual
    # call after the import-time auto-install below).
    if getattr(runner_cls.load_model, "_apollo_patched", False):
        return
    original_load = runner_cls.load_model

    @functools.wraps(original_load)
    def patched_load(self, *args, **kwargs):
        result = original_load(self, *args, **kwargs)
        try:
            export_model_weights(self.model)
        except Exception as e:
            # Deliberate broad catch: best-effort export, keep serving alive.
            print(f"[apollo] Failed to export weights: {e}")
        return result

    patched_load._apollo_patched = True
    runner_cls.load_model = patched_load
    print("[apollo] Weight export hook installed")


# Auto-install when imported
_patch_model_runner()