forked from kent/consciousness
training: restructure as vLLM plugin package
- Convert to installable package with entry points for vLLM auto-discovery
- Add checkpoint_sync.py: Python replacement for the Rust checkpoint binary
  - Block-level diffing of safetensors files (4 KB blocks; sketch below)
  - vLLM→HF weight name conversion built-in
  - Scheduled 10 min after training jobs (batched)
- API change: /train now takes raw token IDs (context_ids + continuation_ids; example below)
  - No tokenizer on the training side; the client owns tokenization
- Remove superseded code: standalone scripts, Rust binary, tokenizer helpers

Install: pip install -e ./training
Then vLLM auto-loads via its entry point (sketched below).

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
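A minimal sketch of what the entry-point wiring could look like for the new package. This is an assumption, not the commit's actual packaging: the package name apollo-training and the module layout are hypothetical, modeled on the deleted vllm_export_hook file below. vLLM's plugin loader scans the vllm.general_plugins entry-point group at startup and calls each registered function; the VLLM_PLUGINS environment variable can restrict which plugin names are loaded.

# setup.py (sketch; names are assumptions, not the commit's actual layout)
from setuptools import setup

setup(
    name="apollo-training",            # assumed package name
    version="0.1.0",
    py_modules=["vllm_export_hook"],
    entry_points={
        # vLLM scans this entry-point group at startup and calls each
        # registered function, so `pip install -e ./training` is enough
        # for the hook to auto-load.
        "vllm.general_plugins": [
            "vllm_export_hook = vllm_export_hook:_patch_model_runner",
        ],
    },
)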
This commit is contained in:
parent b649a11645
commit a73bcf5ae3
15 changed files with 607 additions and 1068 deletions
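To make the checkpoint_sync.py description above concrete, here is a hedged sketch of block-level diffing at the stated 4 KB granularity. The function name and the way changed blocks are consumed are assumptions; the real module also handles vLLM→HF weight name conversion, omitted here.

BLOCK = 4096  # 4 KB blocks, per the commit message

def changed_blocks(old_path, new_path):
    """Yield (offset, block) for each 4 KB block that differs between
    the previous and the new safetensors file."""
    with open(old_path, "rb") as old, open(new_path, "rb") as new:
        offset = 0
        while True:
            a = old.read(BLOCK)
            b = new.read(BLOCK)
            if not b:           # new file exhausted
                break
            if a != b:          # changed or newly appended block
                yield offset, b
            offset += BLOCK

And a hedged illustration of the /train API change: the client tokenizes and ships raw IDs. Only the context_ids and continuation_ids field names come from the commit message; the host, port, and token values below are illustrative placeholders.

import requests

resp = requests.post("http://localhost:8000/train", json={
    "context_ids": [101, 2023, 2003, 1037],   # prompt tokens (placeholder IDs)
    "continuation_ids": [7953, 102],          # target tokens (placeholder IDs)
})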
vllm_export_hook.py (deleted)
@@ -1,76 +0,0 @@
"""Monkey-patch vLLM to export weight IPC handles on startup.

Usage — add to start_vllm.sh BEFORE the vllm serve command:

    export VLLM_PLUGINS=vllm_export_hook
    vllm serve Qwen/Qwen3.5-27B ...

Or use Python to launch vLLM with the hook:

    python3 -c "
    import vllm_export_hook  # installs the patch
    from vllm.entrypoints.openai.api_server import run_server
    run_server(...)
    "

The hook patches vLLM's model runner to export IPC handles after
model loading completes. The handles are saved to a file that the
Apollo training process reads.
"""

import atexit
import torch
from pathlib import Path

HANDLE_PATH = "/tmp/vllm_weight_handles.pt"


def export_model_weights(model):
    """Export CUDA IPC handles for all model parameters."""
    from torch.multiprocessing.reductions import reduce_tensor

    handles = {}
    total_bytes = 0

    for name, param in model.named_parameters():
        if param.device.type != 'cuda':
            continue
        handle = reduce_tensor(param.data)
        handles[name] = {
            'handle': handle,
            'shape': list(param.shape),
            'dtype': str(param.dtype),
        }
        total_bytes += param.nelement() * param.element_size()

    torch.save(handles, HANDLE_PATH)
    print(f"[apollo] Exported {len(handles)} weight handles "
          f"({total_bytes / 1e9:.1f} GB) to {HANDLE_PATH}")


def _patch_model_runner():
    """Patch gpu_worker to export handles after model loading.

    vLLM loads the model in a subprocess (EngineCore_DP0), so we
    can't patch from the parent. Instead, patch the worker's
    init_device or load_model at the module level — the subprocess
    imports the same modules.
    """
    from vllm.v1.worker import gpu_worker

    original_load = gpu_worker.Worker.load_model

    def patched_load(self, *args, **kwargs):
        result = original_load(self, *args, **kwargs)
        try:
            export_model_weights(self.model_runner.model)
        except Exception as e:
            print(f"[apollo] Failed to export weights: {e}")
        return result

    gpu_worker.Worker.load_model = patched_load
    print("[apollo] Weight export hook installed")


# Auto-install when imported
_patch_model_runner()
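The commit removes the file above, but the reader side it describes is worth sketching. Assuming the Apollo training process runs on the same GPU, it can rebuild the exported tensors from the handle file: reduce_tensor() returns a (rebuild_fn, args) pair, and calling rebuild_fn(*args) in another process maps the same CUDA memory rather than copying it. The weights_only flag is standard torch.load; everything else here is an assumption, not code from this repository.

import torch

# The handle file contains pickled rebuild functions, so weights_only
# must be False (newer PyTorch releases default it to True).
handles = torch.load("/tmp/vllm_weight_handles.pt", weights_only=False)

for name, info in handles.items():
    rebuild_fn, args = info["handle"]   # pair produced by reduce_tensor()
    param = rebuild_fn(*args)           # shares vLLM's CUDA memory, no copy
    # Hypothetical usage: in-place writes such as param.copy_(new_weight)
    # become visible to the serving process immediately.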