apollo-checkpoint: efficient diff-based GPU weight checkpointing
Rust tool that mmaps the previous checkpoint, diffs it against the live GPU weights (via CUDA IPC handles), and writes only the blocks that changed. For small behavioral training steps, this turns a 54GB write into ~500MB. Also includes vllm_export_hook.py, which uses a direct source-patch approach: it exports the IPC handles from vLLM's worker subprocess after model load. The tool is run every 10 minutes via cron to protect against vLLM crashes, with a daily rsync to moria for long-term storage.
This commit is contained in:
parent
5f41898bb8
commit
c1245ab139
3 changed files with 305 additions and 5 deletions
|
|
@@ -49,20 +49,26 @@ def export_model_weights(model):
|
|||
|
||||
|
||||
def _patch_model_runner():
|
||||
"""Patch gpu_model_runner to export handles after load_model."""
|
||||
from vllm.v1.worker import gpu_model_runner
|
||||
"""Patch gpu_worker to export handles after model loading.
|
||||
|
||||
original_load = gpu_model_runner.GPUModelRunner.load_model
|
||||
vLLM loads the model in a subprocess (EngineCore_DP0), so we
|
||||
can't patch from the parent. Instead, patch the worker's
|
||||
init_device or load_model at the module level — the subprocess
|
||||
imports the same modules.
|
||||
"""
|
||||
from vllm.v1.worker import gpu_worker
|
||||
|
||||
original_load = gpu_worker.Worker.load_model
|
||||
|
||||
def patched_load(self, *args, **kwargs):
|
||||
result = original_load(self, *args, **kwargs)
|
||||
try:
|
||||
export_model_weights(self.model)
|
||||
export_model_weights(self.model_runner.model)
|
||||
except Exception as e:
|
||||
print(f"[apollo] Failed to export weights: {e}")
|
||||
return result
|
||||
|
||||
gpu_model_runner.GPUModelRunner.load_model = patched_load
|
||||
gpu_worker.Worker.load_model = patched_load
|
||||
print("[apollo] Weight export hook installed")
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue