training: move to dedicated subprocess with ZMQ communication
- Add training_worker.py: a long-lived subprocess that handles GPU training
work and owns the HF model wrapper (views into vLLM GPU memory), the Apollo
optimizer, and checkpoint sync
- train_router.py: now forwards /train requests via async ZMQ instead of
running training in-process. Adds /checkpoint and /train/status endpoints
- export_hook.py: store model_path in __metadata__ so training worker can
find it without cross-process communication
- This fixes two bugs:
1. Process boundary issue - model_path was set in the worker process but
was needed in the API server process
2. Blocking event loop - training blocked vLLM's async event loop
Architecture: vLLM API server <-> ZMQ <-> training subprocess
The subprocess loads IPC handles once, creates views into vLLM's GPU
memory, and handles training requests without blocking inference.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
68a2df2185
commit
2c6a5c0f4a
6 changed files with 503 additions and 233 deletions
|
|
@ -20,7 +20,7 @@ from pathlib import Path
|
|||
HANDLE_PATH = "/tmp/vllm_weight_handles.pt"
|
||||
|
||||
|
||||
def export_model_weights(model):
|
||||
def export_model_weights(model, model_path: str | None = None):
|
||||
"""Export CUDA IPC handles for all model parameters."""
|
||||
from torch.multiprocessing.reductions import reduce_tensor
|
||||
|
||||
|
|
@ -38,6 +38,12 @@ def export_model_weights(model):
|
|||
}
|
||||
total_bytes += param.nelement() * param.element_size()
|
||||
|
||||
# Include metadata for training worker
|
||||
handles['__metadata__'] = {
|
||||
'model_path': model_path,
|
||||
'num_params': len(handles),
|
||||
}
|
||||
|
||||
torch.save(handles, HANDLE_PATH)
|
||||
print(f"[apollo] Exported {len(handles)} weight handles "
|
||||
f"({total_bytes / 1e9:.1f} GB) to {HANDLE_PATH}")
|
||||
|
|
@ -58,11 +64,8 @@ def _patch_model_runner():
|
|||
def patched_load(self, *args, **kwargs):
|
||||
result = original_load(self, *args, **kwargs)
|
||||
try:
|
||||
export_model_weights(self.model_runner.model)
|
||||
# Set model path for training router
|
||||
model_path = self.vllm_config.model_config.model
|
||||
from .train_router import set_model_path
|
||||
set_model_path(model_path)
|
||||
export_model_weights(self.model_runner.model, model_path)
|
||||
except Exception as e:
|
||||
print(f"[apollo] Failed to export weights: {e}")
|
||||
return result
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue