diff --git a/training/DESIGN.md b/training/DESIGN.md
index 5b7fe30..556525f 100644
--- a/training/DESIGN.md
+++ b/training/DESIGN.md
@@ -3,7 +3,7 @@
 ## Overview
 
 Continuous fine-tuning of Qwen3.5-27B alongside live vLLM inference.
-Full-weight updates (not LoRA) using Apollo optimizer with rank-256
+Full-weight updates (not LoRA) using Apollo optimizer with rank-64
 gradient projection. No pause required — HOGWILD concurrent training.
 Weights shared via CUDA IPC between vLLM and the training process.
@@ -63,10 +63,9 @@
 LoRA trains adapter matrices, not base weights. For personality
 and behavioral changes that persist as disposition, the base
 weights need to change. Apollo makes this memory-feasible.
 
-### Rank 256
-Not Mini (rank-1). With 100+ diverse training examples, the
-gradient's effective dimensionality can reach hundreds. Rank-256
-captures the structure. Memory cost: ~10GB (negligible on B200).
+### Rank 64
+Not Mini (rank-1). Rank-64 captures gradient structure across diverse
+training examples while keeping memory low (~2.5GB on a 27B model).
 Compute cost: <0.25% of forward+backward.
 
 ### Channel-wise scaling
@@ -94,7 +93,7 @@
 from a per-parameter seed each step.
 
 ### Parameter grouping (Qwen3.5 gotcha)
 conv1d weights are 3D tensors [10240, 1, 4]. Apollo's projector
 needs 2D matrices with min dimension >= rank. Small/3D tensors
-use standard Adam. Large 2D matrices use Apollo with rank-256.
+use standard Adam. Large 2D matrices use Apollo.
 
 ## Training Data Pipeline
@@ -229,7 +228,7 @@
 a few hundred MB.
 
 | State | Location | Notes |
 |-------|----------|-------|
-| Apollo optimizer | train_router._optimizer | ~10GB for rank-256. Persisted to `/tmp/apollo_optimizer_state.pt` during checkpoint sync. |
+| Apollo optimizer | train_router._optimizer | ~2.5GB for rank-64. Persisted to `/tmp/apollo_optimizer_state.pt` during checkpoint sync. |
 | HF model with vLLM views | train_router._model | Lazy-loaded on first /train. Parameters point to vLLM's GPU memory. |
 
 ## Hyperparameters
@@ -237,7 +236,7 @@
 | Parameter | Value | Rationale |
 |-----------|-------|-----------|
 | Learning rate | 1e-5 to 1e-4 | Standard for full fine-tuning. Higher for diverse batches. |
-| Rank | 256 | Captures gradient structure across 100+ examples. ~10GB state. |
+| Rank | 64 | Captures gradient structure. ~2.5GB state. Defined in `train_router.DEFAULT_RANK`. |
 | Scale type | channel | Per-channel precision, matches LLaMA-Factory defaults. |
 | Epochs | 1 | One pass over diverse data. Multiple epochs risk overfitting. |
 | Batch size | 1 | Single examples, immediate updates. |
@@ -248,7 +247,7 @@
 ## Components
 
 ### Built ✓
-- `optimizer.py` — Apollo optimizer (configurable rank, default 256)
+- `optimizer.py` — Apollo optimizer (configurable rank)
 - `train_router.py` — /train endpoint, runs in vLLM process
 - `weight_mapping.py` — vLLM merged → HF separate views (validated)
 - `export_hook.py` — vLLM plugin hook for IPC handle export
diff --git a/training/apollo_plugin/optimizer.py b/training/apollo_plugin/optimizer.py
index 166ae3a..9abce94 100644
--- a/training/apollo_plugin/optimizer.py
+++ b/training/apollo_plugin/optimizer.py
@@ -8,9 +8,9 @@
 Channel-wise or tensor-wise scaling is sufficient. Apollo
 approximates these scaling factors using a low-rank auxiliary
 optimizer state based on pure random projection.
 
-Default rank=256 (full Apollo). ~10GB state for 27B model, <0.25%
-compute overhead vs forward+backward. Captures gradient structure
-across 100+ behavioral training examples per batch.
+Default rank=64. ~2.5GB state for a 27B model, <0.25% compute
+overhead vs forward+backward. Sufficient for behavioral training
+with diverse examples.
 
 Key implementation details from the paper:
 - Gradient scale factor α = √(n/r) compensates for projection ratio
@@ -34,7 +34,7 @@ class Apollo(Optimizer):
     Args:
         params: model parameters
         lr: learning rate (default: 1e-4)
-        rank: projection rank (default: 256)
+        rank: projection rank (default: 64)
         betas: Adam momentum coefficients (default: (0.9, 0.999))
         eps: numerical stability term (default: 1e-8)
         weight_decay: decoupled weight decay (default: 0.01)
@@ -46,7 +46,7 @@ class Apollo(Optimizer):
         Set to None to disable.
     """
 
-    def __init__(self, params, lr=1e-4, rank=256, betas=(0.9, 0.999),
+    def __init__(self, params, lr=1e-4, rank=64, betas=(0.9, 0.999),
                  eps=1e-8, weight_decay=0.01, warmup_steps=0,
                  scale=None, proj_refresh=200, norm_growth_limit=1.01):
         defaults = dict(lr=lr, rank=rank, betas=betas, eps=eps,
diff --git a/training/apollo_plugin/train_router.py b/training/apollo_plugin/train_router.py
index 4857162..3a35119 100644
--- a/training/apollo_plugin/train_router.py
+++ b/training/apollo_plugin/train_router.py
@@ -42,6 +42,7 @@
 _initialized: bool = False
 _optimizer: Any = None  # Persisted Apollo optimizer
 OPTIMIZER_STATE_PATH = "/tmp/apollo_optimizer_state.pt"
+DEFAULT_RANK = 64
 
 
 def _load_training_model() -> nn.Module:
@@ -150,7 +151,7 @@ def _get_or_create_optimizer(model: nn.Module, config: dict[str, Any]):
     apollo_params, standard_params = [], []
     for p in model.parameters():
         if p.requires_grad:
-            if p.ndim >= 2 and min(p.shape) >= 256:
+            if p.ndim == 2 and min(p.shape) >= config.get('rank', DEFAULT_RANK):
                 apollo_params.append(p)
             else:
                 standard_params.append(p)
@@ -168,7 +169,7 @@
     _optimizer = Apollo(
         groups,
         lr=config.get('lr', 1e-5),
-        rank=config.get('rank', 256),
+        rank=config.get('rank', DEFAULT_RANK),
         betas=tuple(config.get('betas', (0.9, 0.999))),
         eps=config.get('eps', 1e-8),
         weight_decay=config.get('weight_decay', 0.01),
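
For reviewers who want to see what `rank` controls, here is a minimal, self-contained sketch of the projected update described in the optimizer.py docstring. It is illustrative only: `apollo_step_2d` and its `state` dict are hypothetical scaffolding, not the plugin's API, and the exact channel-scale formula is an assumption based on the docstring's random projection, channel-wise scaling, and α = √(n/r) points.

```python
import torch


def apollo_step_2d(w, grad, state, lr=1e-4, rank=64,
                   betas=(0.9, 0.999), eps=1e-8):
    """One Apollo-style update for a single 2D weight [m, n] (sketch)."""
    m_dim, n_dim = grad.shape
    b1, b2 = betas
    if "proj" not in state:
        # Pure random projection; the plugin regenerates this from a
        # per-parameter seed each step instead of storing it.
        state["proj"] = torch.randn(rank, m_dim, device=grad.device) / rank ** 0.5
        state["m"] = torch.zeros(rank, n_dim, device=grad.device)
        state["v"] = torch.zeros(rank, n_dim, device=grad.device)
        state["step"] = 0
    state["step"] += 1
    t = state["step"]

    r = state["proj"] @ grad                          # [rank, n] compressed gradient
    state["m"].mul_(b1).add_(r, alpha=1 - b1)         # Adam moments are [rank, n],
    state["v"].mul_(b2).addcmul_(r, r, value=1 - b2)  # which is why the state is small
    m_hat = state["m"] / (1 - b1 ** t)
    v_hat = state["v"] / (1 - b2 ** t)
    r_tilde = m_hat / (v_hat.sqrt() + eps)            # Adam-normalized, still low-rank

    # Channel-wise scaling: per-column ratio of the Adam-normalized
    # projected gradient to the raw projected gradient.
    scale = r_tilde.norm(dim=0) / (r.norm(dim=0) + eps)  # [n]
    alpha = (n_dim / rank) ** 0.5  # α = √(n/r) compensates the projection ratio
    w.add_(grad * scale, alpha=-lr * alpha)           # full-rank weight, scaled update
```

Only the moments scale with rank: for a hypothetical [5120, 5120] matrix they are two [rank, 5120] tensors, so optimizer state grows linearly in rank, consistent with the ~10GB at rank-256 versus ~2.5GB at rank-64 figures above.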
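
The parameter-grouping rule that `DEFAULT_RANK` feeds can likewise be checked in isolation. `split_params` and the toy module below are hypothetical stand-ins mirroring the logic in `_get_or_create_optimizer`, including the Qwen3.5 conv1d gotcha from DESIGN.md:

```python
import torch.nn as nn

DEFAULT_RANK = 64  # mirrors train_router.DEFAULT_RANK


def split_params(model, rank=DEFAULT_RANK):
    """Large 2D matrices go to Apollo; small/1D/3D tensors go to Adam."""
    apollo_params, standard_params = [], []
    for p in model.parameters():
        if not p.requires_grad:
            continue
        # Apollo's projector needs 2D matrices with min dimension >= rank.
        if p.ndim == 2 and min(p.shape) >= rank:
            apollo_params.append(p)
        else:
            standard_params.append(p)
    return apollo_params, standard_params


# Toy stand-ins: a Linear weight is a large 2D matrix, a depthwise
# Conv1d weight is 3D ([10240, 1, 4]), and biases are 1D.
toy = nn.ModuleList([
    nn.Linear(4096, 4096),
    nn.Conv1d(10240, 10240, kernel_size=4, groups=10240),
])
apollo, standard = split_params(toy)
assert all(p.ndim == 2 and min(p.shape) >= DEFAULT_RANK for p in apollo)
assert any(p.ndim == 3 for p in standard)  # conv1d weight falls back to Adam
```

Grouping by the configured rank (rather than the hardcoded 256 this diff removes) keeps the projector's min-dimension requirement intact when a caller passes a non-default rank to /train.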