training: use rank 64, define as single constant
- DEFAULT_RANK = 64 defined in train_router.py
- All references use the constant, not magic numbers
- ~2.5 GB optimizer state instead of ~10 GB

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
039473d31f
commit
68a2df2185
3 changed files with 16 additions and 16 deletions
|
|
@@ -42,6 +42,7 @@ _initialized: bool = False
|
|||
_optimizer: Any = None # Persisted Apollo optimizer
|
||||
|
||||
OPTIMIZER_STATE_PATH = "/tmp/apollo_optimizer_state.pt"
|
||||
DEFAULT_RANK = 64
|
||||
|
||||
|
||||
def _load_training_model() -> nn.Module:
|
||||
|
|
@@ -150,7 +151,7 @@ def _get_or_create_optimizer(model: nn.Module, config: dict[str, Any]):
|
|||
apollo_params, standard_params = [], []
|
||||
for p in model.parameters():
|
||||
if p.requires_grad:
|
||||
if p.ndim >= 2 and min(p.shape) >= 256:
|
||||
if p.ndim >= 2 and min(p.shape) >= DEFAULT_RANK:
|
||||
apollo_params.append(p)
|
||||
else:
|
||||
standard_params.append(p)
|
||||
|
|
@@ -168,7 +169,7 @@ def _get_or_create_optimizer(model: nn.Module, config: dict[str, Any]):
|
|||
_optimizer = Apollo(
|
||||
groups,
|
||||
lr=config.get('lr', 1e-5),
|
||||
rank=config.get('rank', 256),
|
||||
rank=config.get('rank', DEFAULT_RANK),
|
||||
betas=tuple(config.get('betas', (0.9, 0.999))),
|
||||
eps=config.get('eps', 1e-8),
|
||||
weight_decay=config.get('weight_decay', 0.01),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue