"""Monkey-patch vLLM to export weight IPC handles on startup. Usage — install the apollo_plugin package: pip install -e /path/to/training Then vLLM auto-discovers and loads via entry point. Or filter: VLLM_PLUGINS=apollo vllm serve Qwen/Qwen3.5-27B ... The hook patches vLLM's model runner to export IPC handles after model loading completes. The handles are saved to a file that the Apollo training process reads. """ import atexit import torch from pathlib import Path HANDLE_PATH = "/tmp/vllm_weight_handles.pt" def export_model_weights(model, model_path: str | None = None): """Export CUDA IPC handles for all model parameters.""" from torch.multiprocessing.reductions import reduce_tensor handles = {} total_bytes = 0 for name, param in model.named_parameters(): if param.device.type != 'cuda': continue handle = reduce_tensor(param.data) handles[name] = { 'handle': handle, 'shape': list(param.shape), 'dtype': str(param.dtype), } total_bytes += param.nelement() * param.element_size() # Include metadata for training worker handles['__metadata__'] = { 'model_path': model_path, 'num_params': len(handles), } torch.save(handles, HANDLE_PATH) print(f"[apollo] Exported {len(handles)} weight handles " f"({total_bytes / 1e9:.1f} GB) to {HANDLE_PATH}") def _patch_model_runner(): """Patch gpu_worker to export handles after model loading. vLLM loads the model in a subprocess (EngineCore_DP0), so we can't patch from the parent. Instead, patch the worker's init_device or load_model at the module level — the subprocess imports the same modules. """ from vllm.v1.worker import gpu_worker original_load = gpu_worker.Worker.load_model def patched_load(self, *args, **kwargs): result = original_load(self, *args, **kwargs) try: model_path = self.vllm_config.model_config.model export_model_weights(self.model_runner.model, model_path) except Exception as e: print(f"[apollo] Failed to export weights: {e}") return result gpu_worker.Worker.load_model = patched_load print("[apollo] Weight export hook installed")