apollo-mini training system: initial implementation

Core components for online fine-tuning of Qwen3.5-27B with CUDA IPC shared weight memory between vLLM and the training process:

- apollo_mini.py: rank-1 optimizer (SGD memory, AdamW quality)
- apollo_worker.py: HTTP daemon coordinating training with vLLM
- weight_mapping.py: vLLM merged → HF separate layout (zero-copy views)
- training_example.py: tokenization with chat template
- export_weights.py: CUDA IPC handle export from vLLM
- train.py: standalone training script (alternative to daemon)
- DESIGN.md: architecture and protocol documentation

Validated: CUDA IPC autograd works on real Qwen3.5 weights (B200). Apollo-Mini rank-1 projection + scaling + in-place update confirmed.

Co-Authored-By: Kent Overstreet <kent.overstreet@gmail.com>
2026-03-30 22:02:37 -04:00 · 2026-03-30 22:02:37 -04:00 · c5d7d8cb5d
commit c5d7d8cb5d
parent 13453606ae
7 changed files with 1484 additions and 0 deletions
--- a/training/weight_mapping.py
+++ b/training/weight_mapping.py
@ -0,0 +1,141 @@
+"""Map between vLLM's merged weight layout and HuggingFace's separate layout.
+
+vLLM merges weights for efficiency:
+  in_proj_qkv + in_proj_z → in_proj_qkvz  [key_dim*2 + value_dim*2, hidden]
+  in_proj_b + in_proj_a   → in_proj_ba    [num_v_heads*2, hidden]
+  gate_proj + up_proj      → gate_up_proj  [intermediate*2, hidden]
+
+This module creates HF-compatible parameter views that point to the same
+GPU memory as vLLM's merged tensors. No copies — views share storage.
+"""
+
+import torch
+import torch.nn as nn
+
+
+# Model dimensions for Qwen3.5-27B. The row offsets used to split vLLM's
+# merged tensors in this module are computed from these values.
+HIDDEN = 5120
+INTERMEDIATE = 17408
+NUM_LAYERS = 64
+CONV_KERNEL = 4
+
+# Head counts and per-head widths for the key and value projections
+HEAD_K_DIM = 128
+HEAD_V_DIM = 128
+NUM_K_HEADS = 16
+NUM_V_HEADS = 48
+
+# Derived widths
+KEY_DIM = NUM_K_HEADS * HEAD_K_DIM     # 16 * 128 = 2048
+VALUE_DIM = NUM_V_HEADS * HEAD_V_DIM   # 48 * 128 = 6144
+CONV_DIM = KEY_DIM * 2 + VALUE_DIM     # 2 * 2048 + 6144 = 10240
+
+
+def vllm_to_hf_views(vllm_params: dict[str, torch.Tensor]
+                      ) -> dict[str, torch.Tensor]:
+    """Create HF-compatible parameter views from vLLM merged weights.
+
+    Every merged tensor is cut in two along dim 0 by basic slicing, so the
+    results are views on the same memory as the vLLM tensor — no copies.
+    Parameters whose name contains none of the merged keys are passed
+    through unchanged under their original name.
+
+    Args:
+        vllm_params: vLLM parameter names → tensors. A merged name may end
+            at the merged key (``...in_proj_qkvz``) or carry a suffix
+            (``...in_proj_qkvz.weight``).
+
+    Returns:
+        A dict of HF-style parameter names → tensors. For a merged name the
+        merged key is replaced by each HF name and the suffix is kept; when
+        the name has no suffix, ``.weight`` is appended.
+
+    NOTE(review): the split rows come from the module-level Qwen3.5-27B
+    constants and the tensor's row count is not checked, so a tensor with a
+    different number of rows would be split at the wrong place — confirm
+    callers only pass tensors with the full dimensions.
+    """
+    # (merged key, first HF name, second HF name, rows in the first piece).
+    # Order matters only if a name contained more than one key; it mirrors
+    # the qkvz → ba → gate_up precedence.
+    merged_layouts = (
+        ('in_proj_qkvz', 'in_proj_qkv', 'in_proj_z', KEY_DIM * 2 + VALUE_DIM),
+        ('in_proj_ba', 'in_proj_b', 'in_proj_a', NUM_V_HEADS),
+        ('gate_up_proj', 'gate_proj', 'up_proj', INTERMEDIATE),
+    )
+
+    hf_params = {}
+
+    for name, tensor in vllm_params.items():
+        layout = next(
+            (entry for entry in merged_layouts if entry[0] in name), None)
+
+        # Pass through non-merged params unchanged
+        if layout is None:
+            hf_params[name] = tensor
+            continue
+
+        merged_key, first, second, split_row = layout
+
+        # Split the name around the merged key so that whatever follows it
+        # (normally ".weight") is kept as the suffix instead of ending up
+        # in the middle of the new name.
+        head, _, tail = name.partition(merged_key)
+        suffix = tail or '.weight'
+
+        hf_params[head + first + suffix] = tensor[:split_row]
+        hf_params[head + second + suffix] = tensor[split_row:]
+
+    return hf_params
+
+
+def load_hf_model_with_vllm_weights(
+    vllm_params: dict[str, torch.Tensor],
+    model_path: str,
+    device: str = "cuda:0",
+) -> nn.Module:
+    """Load HF Qwen3.5 model with weights pointing to vLLM's GPU memory.
+
+    1. Creates HF-compatible views from vLLM's merged weights
+    2. Instantiates the HF model from its config under the meta device
+    3. Replaces each model parameter that has a matching view
+    4. Puts the model in train mode and returns it
+
+    Args:
+        vllm_params: vLLM parameter names → tensors, as accepted by
+            ``vllm_to_hf_views``.
+        model_path: Passed to ``AutoConfig.from_pretrained`` to obtain the
+            model config.
+        device: Currently unused; replaced parameters stay wherever the
+            supplied tensors live.
+
+    Returns:
+        The model with every matched parameter replaced by an
+        ``nn.Parameter`` (``requires_grad=True``) wrapping the view.
+        Parameters without a matching view are only reported and are left
+        as created under the meta device.
+    """
+    from transformers import AutoModelForCausalLM, AutoConfig
+
+    # Create HF-compatible views
+    hf_params = vllm_to_hf_views(vllm_params)
+
+    # Load config
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+
+    # Build the module tree under the meta device
+    with torch.device('meta'):
+        model = AutoModelForCausalLM.from_config(
+            config, trust_remote_code=True)
+
+    # Snapshot the names before touching anything: named_parameters() is a
+    # generator over the live modules, and the loop below rebinds their
+    # parameters.
+    param_names = [name for name, _ in model.named_parameters()]
+
+    # Replace parameters with views into vLLM memory
+    replaced = 0
+    missing = []
+    for name in param_names:
+        if name not in hf_params:
+            missing.append(name)
+            continue
+
+        # rpartition yields an empty module path for a parameter registered
+        # directly on the root module, in which case the parent is the
+        # model itself.
+        module_path, _, attr = name.rpartition('.')
+        parent = model
+        if module_path:
+            for part in module_path.split('.'):
+                parent = getattr(parent, part)
+
+        # Replace with view (shared GPU memory)
+        setattr(parent, attr,
+                nn.Parameter(hf_params[name], requires_grad=True))
+        replaced += 1
+
+    print(f"Replaced {replaced} parameters with vLLM memory views")
+    if missing:
+        print(f"Missing {len(missing)} parameters: {missing[:5]}...")
+
+    model.train()
+    return model
+
+
+def validate_views(vllm_params: dict[str, torch.Tensor],
+                   hf_params: dict[str, torch.Tensor]):
+    """Verify that HF views share storage with vLLM tensors.
+
+    For every vLLM parameter whose name contains a merged key
+    (``in_proj_qkvz``, ``in_proj_ba``, ``gate_up_proj``), the two HF names
+    are derived by replacing the merged key with each HF name, keeping
+    whatever follows the key (``.weight`` when nothing follows). Each
+    derived name that is present in ``hf_params`` must have the same
+    storage data pointer as the merged tensor. Derived names that are
+    absent from ``hf_params`` are skipped.
+
+    Args:
+        vllm_params: vLLM parameter names → merged tensors.
+        hf_params: HF-style parameter names → tensors to check.
+
+    Raises:
+        AssertionError: if a present HF tensor does not share storage with
+            its merged vLLM tensor. Raised explicitly rather than through
+            ``assert`` so the check still runs under ``python -O``.
+    """
+    split_names = {
+        'in_proj_qkvz': ('in_proj_qkv', 'in_proj_z'),
+        'in_proj_ba': ('in_proj_b', 'in_proj_a'),
+        'gate_up_proj': ('gate_proj', 'up_proj'),
+    }
+
+    for vllm_name, vllm_tensor in vllm_params.items():
+        merged_key = next(
+            (key for key in split_names if key in vllm_name), None)
+        if merged_key is None:
+            continue
+
+        head, _, tail = vllm_name.partition(merged_key)
+        suffix = tail or '.weight'
+
+        # Compare storage base pointers, not tensor data pointers: the
+        # second piece of each split starts at a non-zero offset into the
+        # same storage.
+        merged_ptr = vllm_tensor.untyped_storage().data_ptr()
+
+        for part in split_names[merged_key]:
+            hf_name = head + part + suffix
+            if hf_name not in hf_params:
+                continue
+            if hf_params[hf_name].untyped_storage().data_ptr() != merged_ptr:
+                raise AssertionError(f"{hf_name} doesn't share storage!")
+
+    print("All views validated — shared storage confirmed")