- Add training_worker.py: long-lived subprocess that handles GPU training
work, owns HF model wrapper (views into vLLM GPU memory), Apollo
optimizer, and checkpoint sync
- train_router.py: now forwards /train requests via async ZMQ instead of
running training in-process. Adds /checkpoint and /train/status endpoints
- export_hook.py: store model_path in __metadata__ so training worker can
find it without cross-process communication
- This fixes two bugs:
1. Process boundary issue - model_path was set in worker process but
needed in API server process
2. Blocking event loop - training blocked vLLM's async event loop
Architecture: vLLM API server <-> ZMQ <-> training subprocess
The subprocess loads IPC handles once, creates views into vLLM's GPU
memory, and handles training requests without blocking inference.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
74 lines
2.3 KiB
Python
74 lines
2.3 KiB
Python
"""Monkey-patch vLLM to export weight IPC handles on startup.

Usage — install the apollo_plugin package:

    pip install -e /path/to/training

vLLM then auto-discovers and loads the plugin via its entry point. To
load only this plugin, filter with VLLM_PLUGINS:

    VLLM_PLUGINS=apollo vllm serve Qwen/Qwen3.5-27B ...

The hook patches vLLM's GPU worker (Worker.load_model) to export IPC
handles after model loading completes. The handles are saved to a file
that the Apollo training process reads.
"""
|
|
|
|
import atexit
|
|
import torch
|
|
from pathlib import Path
|
|
|
|
# File that export_model_weights() writes the weight-handle dict to
# with torch.save().
HANDLE_PATH = "/tmp/vllm_weight_handles.pt"
|
|
|
|
|
|
def export_model_weights(model, model_path: str | None = None):
|
|
"""Export CUDA IPC handles for all model parameters."""
|
|
from torch.multiprocessing.reductions import reduce_tensor
|
|
|
|
handles = {}
|
|
total_bytes = 0
|
|
|
|
for name, param in model.named_parameters():
|
|
if param.device.type != 'cuda':
|
|
continue
|
|
handle = reduce_tensor(param.data)
|
|
handles[name] = {
|
|
'handle': handle,
|
|
'shape': list(param.shape),
|
|
'dtype': str(param.dtype),
|
|
}
|
|
total_bytes += param.nelement() * param.element_size()
|
|
|
|
# Include metadata for training worker
|
|
handles['__metadata__'] = {
|
|
'model_path': model_path,
|
|
'num_params': len(handles),
|
|
}
|
|
|
|
torch.save(handles, HANDLE_PATH)
|
|
print(f"[apollo] Exported {len(handles)} weight handles "
|
|
f"({total_bytes / 1e9:.1f} GB) to {HANDLE_PATH}")
|
|
|
|
|
|
def _patch_model_runner():
    """Patch gpu_worker to export handles after model loading.

    vLLM loads the model in a subprocess (EngineCore_DP0), so we
    can't patch from the parent. Instead, patch the worker's
    load_model at the module level — the subprocess imports the
    same modules.

    The patch is idempotent: if ``Worker.load_model`` already carries the
    export hook, the function returns without wrapping it again, so a
    repeated call does not cause the handles to be exported more than once
    per model load.
    """
    from functools import wraps

    from vllm.v1.worker import gpu_worker

    worker_cls = gpu_worker.Worker
    original_load = worker_cls.load_model

    # Already patched in this process; wrapping again would stack hooks.
    if getattr(original_load, "_apollo_export_hook", False):
        return

    @wraps(original_load)
    def patched_load(self, *args, **kwargs):
        result = original_load(self, *args, **kwargs)
        # Best effort: a failed export must not break vLLM's model load,
        # so the error is reported and the original result still returned.
        try:
            model_path = self.vllm_config.model_config.model
            export_model_weights(self.model_runner.model, model_path)
        except Exception as e:
            print(f"[apollo] Failed to export weights: {e}")
        return result

    # Marker checked above on subsequent calls.
    patched_load._apollo_export_hook = True

    worker_cls.load_model = patched_load
    print("[apollo] Weight export hook installed")
|