vllm weight export hook: monkey-patches model runner to save IPC handles on load

2026-03-30 22:20:04 -04:00 · 2026-03-30 22:20:04 -04:00 · 0402a9333c
commit 0402a9333c
parent 8e7b4a22db
1 changed files with 70 additions and 0 deletions
--- a/training/vllm_export_hook.py
+++ b/training/vllm_export_hook.py
@ -0,0 +1,70 @@
+"""Monkey-patch vLLM to export weight IPC handles on startup.
+
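+Usage — register this module as a vLLM general plugin (an entry point in
+the ``vllm.general_plugins`` group of an installed package), then select
+it in start_vllm.sh BEFORE the vllm serve command:
+
+    export VLLM_PLUGINS=vllm_export_hook
+    vllm serve Qwen/Qwen3.5-27B ...
+
+Note that VLLM_PLUGINS only filters plugins that are already registered
+as entry points; setting it alone does not import this module.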
+
+Or use Python to launch vLLM with the hook:
+
+    python3 -c "
+    import vllm_export_hook  # installs the patch
+    from vllm.entrypoints.openai.api_server import run_server
+    run_server(...)
+    "
+
+The hook patches vLLM's model runner to export IPC handles after
+model loading completes. The handles are saved to a file that the
+Apollo training process reads.
+"""
+
+import atexit
+import torch
+from pathlib import Path
+
+# File that export_model_weights() writes the handle dictionary to via
+# torch.save. The path is fixed, so every process using this module
+# targets the same file.
+HANDLE_PATH = "/tmp/vllm_weight_handles.pt"
+
+
+def export_model_weights(model):
+    """Export CUDA IPC handles for all model parameters."""
+    from torch.multiprocessing.reductions import reduce_tensor
+
+    handles = {}
+    total_bytes = 0
+
+    for name, param in model.named_parameters():
+        if param.device.type != 'cuda':
+            continue
+        handle = reduce_tensor(param.data)
+        handles[name] = {
+            'handle': handle,
+            'shape': list(param.shape),
+            'dtype': str(param.dtype),
+        }
+        total_bytes += param.nelement() * param.element_size()
+
+    torch.save(handles, HANDLE_PATH)
+    print(f"[apollo] Exported {len(handles)} weight handles "
+          f"({total_bytes / 1e9:.1f} GB) to {HANDLE_PATH}")
+
+
+def _patch_model_runner():
+    """Patch gpu_model_runner to export handles after load_model."""
+    from vllm.v1.worker import gpu_model_runner
+
+    original_load = gpu_model_runner.GPUModelRunner.load_model
+
+    def patched_load(self, *args, **kwargs):
+        result = original_load(self, *args, **kwargs)
+        try:
+            export_model_weights(self.model)
+        except Exception as e:
+            print(f"[apollo] Failed to export weights: {e}")
+        return result
+
+    gpu_model_runner.GPUModelRunner.load_model = patched_load
+    print("[apollo] Weight export hook installed")
+
+
+# Auto-install when imported: importing this module is what applies the
+# patch, so this call runs as an import-time side effect. It imports
+# vllm.v1.worker.gpu_model_runner immediately, so importing this module
+# fails if that module is unavailable.
+_patch_model_runner()