"""Monkey-patch vLLM to export weight IPC handles on startup.

Usage — add to start_vllm.sh BEFORE the vllm serve command:

    export VLLM_PLUGINS=vllm_export_hook
    vllm serve Qwen/Qwen3.5-27B ...

Or use Python to launch vLLM with the hook:

    python3 -c "
    import vllm_export_hook  # installs the patch
    from vllm.entrypoints.openai.api_server import run_server
    run_server(...)
    "

The hook patches vLLM's model runner to export IPC handles after model
loading completes. The handles are saved to a file that the Apollo
training process reads.
"""

import atexit
import functools
import os

import torch
from pathlib import Path

# Destination for the exported handle file.  Overridable via environment
# so several vLLM instances on one host don't clobber each other's files;
# the default matches the original hard-coded path.
HANDLE_PATH = os.environ.get("APOLLO_HANDLE_PATH", "/tmp/vllm_weight_handles.pt")


def export_model_weights(model):
    """Export CUDA IPC handles for all model parameters.

    Iterates ``model.named_parameters()``, skipping any parameter that is
    not on a CUDA device (IPC handles only exist for CUDA memory), and
    saves a dict mapping parameter name -> ``{'handle', 'shape', 'dtype'}``
    to ``HANDLE_PATH`` via ``torch.save``.

    Args:
        model: a ``torch.nn.Module`` (or anything exposing
            ``named_parameters()``) whose weights live on CUDA.
    """
    from torch.multiprocessing.reductions import reduce_tensor

    handles = {}
    total_bytes = 0
    for name, param in model.named_parameters():
        if param.device.type != 'cuda':
            continue
        # reduce_tensor returns the (rebuild_fn, args) pair torch uses for
        # cross-process tensor sharing; args embeds the CUDA IPC handle.
        handle = reduce_tensor(param.data)
        handles[name] = {
            'handle': handle,
            'shape': list(param.shape),
            'dtype': str(param.dtype),
        }
        total_bytes += param.nelement() * param.element_size()

    torch.save(handles, HANDLE_PATH)
    print(f"[apollo] Exported {len(handles)} weight handles "
          f"({total_bytes / 1e9:.1f} GB) to {HANDLE_PATH}")


def _patch_model_runner():
    """Patch gpu_model_runner to export handles after load_model.

    Idempotent: if ``load_model`` already carries our wrapper (e.g. the
    module gets imported twice under different names, or this is called
    again manually), nothing is re-wrapped — the original code would have
    stacked wrappers and exported the handles once per wrap.
    """
    from vllm.v1.worker import gpu_model_runner

    original_load = gpu_model_runner.GPUModelRunner.load_model
    if getattr(original_load, "_apollo_export_hook", False):
        return  # already patched; avoid double-wrapping

    @functools.wraps(original_load)  # preserve name/docstring of load_model
    def patched_load(self, *args, **kwargs):
        result = original_load(self, *args, **kwargs)
        # Best-effort: a failed export must never break model loading.
        try:
            export_model_weights(self.model)
        except Exception as e:
            print(f"[apollo] Failed to export weights: {e}")
        return result

    patched_load._apollo_export_hook = True  # marker for the idempotence guard
    gpu_model_runner.GPUModelRunner.load_model = patched_load
    print("[apollo] Weight export hook installed")


# Auto-install when imported
_patch_model_runner()