training: move to dedicated subprocess with ZMQ communication

- Add training_worker.py: a long-lived subprocess that handles GPU training work and owns the HF model wrapper (views into vLLM GPU memory), the Apollo optimizer, and checkpoint sync.
- train_router.py: now forwards /train requests via async ZMQ instead of running training in-process. Adds /checkpoint and /train/status endpoints.
- export_hook.py: stores model_path in __metadata__ so the training worker can find it without cross-process communication.

This fixes two bugs:
1. Process boundary issue - model_path was set in the worker process but needed in the API server process.
2. Blocking event loop - training blocked vLLM's async event loop.

Architecture: vLLM API server <-> ZMQ <-> training subprocess

The subprocess loads the IPC handles once, creates views into vLLM's GPU memory, and handles training requests without blocking inference.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-16 02:01:59 -04:00 · 2026-04-16 02:01:59 -04:00 · 2c6a5c0f4a
commit 2c6a5c0f4a
parent 68a2df2185
6 changed files with 503 additions and 233 deletions
--- a/training/apollo_plugin/export_hook.py
+++ b/training/apollo_plugin/export_hook.py
@ -20,7 +20,7 @@ from pathlib import Path
 HANDLE_PATH = "/tmp/vllm_weight_handles.pt"


-def export_model_weights(model):
+def export_model_weights(model, model_path: str | None = None):
    """Export CUDA IPC handles for all model parameters."""
    from torch.multiprocessing.reductions import reduce_tensor

@ -38,6 +38,12 @@ def export_model_weights(model):
        }
        total_bytes += param.nelement() * param.element_size()

+    # Include metadata for training worker
+    handles['__metadata__'] = {
+        'model_path': model_path,
+        'num_params': len(handles),
+    }
+
    torch.save(handles, HANDLE_PATH)
    print(f"[apollo] Exported {len(handles)} weight handles "
          f"({total_bytes / 1e9:.1f} GB) to {HANDLE_PATH}")
@ -58,11 +64,8 @@ def _patch_model_runner():
    def patched_load(self, *args, **kwargs):
        result = original_load(self, *args, **kwargs)
        try:
-            export_model_weights(self.model_runner.model)
-            # Set model path for training router
            model_path = self.vllm_config.model_config.model
-            from .train_router import set_model_path
-            set_model_path(model_path)
+            export_model_weights(self.model_runner.model, model_path)
        except Exception as e:
            print(f"[apollo] Failed to export weights: {e}")
        return result