Rust tool that mmaps previous checkpoint, diffs against live GPU weights (via CUDA IPC handles), and only writes changed blocks. For small behavioral training steps, turns 54GB write into ~500MB. Also includes vllm_export_hook.py with direct source patch approach — exports IPC handles from vLLM's worker subprocess after model load. Run every 10 minutes via cron to protect against vLLM crashes. Daily rsync to moria for long-term storage.
13 lines
266 B
TOML
13 lines
266 B
TOML
[package]
|
|
name = "apollo-checkpoint"
|
|
version = "0.1.0"
|
|
edition = "2024"
|
|
|
|
[dependencies]
|
|
memmap2 = "0.9"
|
|
safetensors = "0.5"
|
|
serde = { version = "1", features = ["derive"] }
|
|
serde_json = "1"
|
|
anyhow = "1"
|
|
clap = { version = "4", features = ["derive"] }
|
|
chrono = "0.4"
|