"""Monkey-patch vLLM to export weight IPC handles on startup.

(From patch: "vllm weight export hook: monkey-patches model runner to
save IPC handles on load".)

Usage — add to start_vllm.sh BEFORE the vllm serve command:

    export VLLM_PLUGINS=vllm_export_hook
    vllm serve Qwen/Qwen3.5-27B ...

NOTE(review): VLLM_PLUGINS selects among *installed entry-point* plugins;
confirm this module is actually registered as one, or rely on the explicit
import form below.

Or use Python to launch vLLM with the hook:

    python3 -c "
    import vllm_export_hook  # installs the patch
    from vllm.entrypoints.openai.api_server import run_server
    run_server(...)
    "

The hook patches vLLM's model runner to export IPC handles after
model loading completes. The handles are saved to a file that the
Apollo training process reads.
"""

import atexit
from pathlib import Path

import torch

# File the Apollo training process reads the exported CUDA IPC handles from.
HANDLE_PATH = "/tmp/vllm_weight_handles.pt"


def _cleanup_handle_file():
    """Delete the handle file on interpreter exit.

    CUDA IPC handles become invalid the moment the producing process dies,
    so a leftover file would only mislead a later reader.
    """
    Path(HANDLE_PATH).unlink(missing_ok=True)


def export_model_weights(model):
    """Export CUDA IPC handles for all CUDA-resident model parameters.

    Writes a dict ``{param_name: {'handle', 'shape', 'dtype'}}`` to
    ``HANDLE_PATH`` via ``torch.save``. Parameters not on a CUDA device
    are skipped (so on a CPU-only model an empty dict is written).

    Args:
        model: any object exposing ``named_parameters()`` (e.g. nn.Module).
    """
    # Local import: reduce_tensor is only needed when we actually export.
    from torch.multiprocessing.reductions import reduce_tensor

    handles = {}
    total_bytes = 0

    for name, param in model.named_parameters():
        if param.device.type != 'cuda':
            continue  # IPC handles only exist for CUDA storage
        handles[name] = {
            'handle': reduce_tensor(param.data),
            'shape': list(param.shape),
            'dtype': str(param.dtype),
        }
        total_bytes += param.nelement() * param.element_size()

    torch.save(handles, HANDLE_PATH)
    # Handles die with this process — make sure the file doesn't outlive us.
    # unregister-then-register keeps repeated exports from stacking hooks.
    atexit.unregister(_cleanup_handle_file)
    atexit.register(_cleanup_handle_file)
    print(f"[apollo] Exported {len(handles)} weight handles "
          f"({total_bytes / 1e9:.1f} GB) to {HANDLE_PATH}")


def _patch_model_runner():
    """Patch gpu_model_runner to export handles after load_model.

    Idempotent: importing this module more than once will not stack the
    wrapper (and therefore will not export twice per load).

    Raises:
        ImportError: if vLLM (v1 worker layout) is not importable.
    """
    from vllm.v1.worker import gpu_model_runner

    runner_cls = gpu_model_runner.GPUModelRunner
    if getattr(runner_cls.load_model, '_apollo_patched', False):
        return  # hook already installed

    original_load = runner_cls.load_model

    def patched_load(self, *args, **kwargs):
        result = original_load(self, *args, **kwargs)
        # Best effort: a failed export must never break model loading.
        try:
            export_model_weights(self.model)
        except Exception as e:
            print(f"[apollo] Failed to export weights: {e}")
        return result

    patched_load._apollo_patched = True  # marker for the idempotence guard
    runner_cls.load_model = patched_load
    print("[apollo] Weight export hook installed")


# Auto-install when imported. Tolerate a missing vLLM so the module can be
# imported (and unit-tested) outside a vLLM environment — the docstring's
# "import then launch" usage depends on import not hard-failing.
try:
    _patch_model_runner()
except ImportError as e:
    print(f"[apollo] vLLM not available; hook not installed: {e}")