- Add training_worker.py: long-lived subprocess that handles GPU training
work, owns HF model wrapper (views into vLLM GPU memory), Apollo
optimizer, and checkpoint sync
- train_router.py: now forwards /train requests via async ZMQ instead of
running training in-process. Adds /checkpoint and /train/status endpoints
- export_hook.py: store model_path in __metadata__ so training worker can
find it without cross-process communication
- This fixes two bugs:
1. Process boundary issue - model_path was set in the worker process
but needed in the API server process
2. Blocking event loop - training blocked vLLM's async event loop
Architecture: vLLM API server <-> ZMQ <-> training subprocess
The subprocess loads IPC handles once, creates views into vLLM's GPU
memory, and handles training requests without blocking inference.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
29 lines
629 B
TOML
# PEP 517 build backend: plain setuptools (>=61 for pyproject-based metadata).
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

# Core project metadata (PEP 621).
[project]
name = "apollo-plugin"
version = "0.1.0"
description = "Apollo training plugin for vLLM"
requires-python = ">=3.10"
# Runtime dependencies, sorted alphabetically. vLLM itself is intentionally
# not listed: the plugin is loaded *by* vLLM via its entry point.
dependencies = [
    "aiohttp",
    "pyzmq",
    "safetensors",
    "torch",
]

# Extras: `pip install apollo-plugin[dev]` pulls in the test runner.
[project.optional-dependencies]
dev = ["pytest"]

# vLLM general-plugin hook: vLLM discovers this entry point and calls
# `apollo_plugin.register()` at startup. The group name contains a dot,
# so it must be a quoted key.
[project.entry-points."vllm.general_plugins"]
apollo = "apollo_plugin:register"

# Console scripts installed alongside the package.
[project.scripts]
apollo-checkpoint = "apollo_plugin.checkpoint_sync:main"
apollo-worker = "apollo_plugin.training_worker:main"

# Package discovery: only the apollo_plugin package (and its subpackages).
[tool.setuptools.packages.find]
where = ["."]
include = ["apollo_plugin*"]