Core components for online fine-tuning of Qwen3.5-27B with CUDA IPC shared weight memory between vLLM and the training process:

- apollo_mini.py: rank-1 optimizer (SGD memory, AdamW quality)
- apollo_worker.py: HTTP daemon coordinating training with vLLM
- weight_mapping.py: vLLM merged → HF separate layout (zero-copy views)
- training_example.py: tokenization with chat template
- export_weights.py: CUDA IPC handle export from vLLM
- train.py: standalone training script (alternative to daemon)
- DESIGN.md: architecture and protocol documentation

Validated: CUDA IPC autograd works on real Qwen3.5 weights (B200).
Apollo-Mini rank-1 projection + scaling + in-place update confirmed.

Co-Authored-By: Kent Overstreet <kent.overstreet@gmail.com>
"""Map between vLLM's merged weight layout and HuggingFace's separate layout.
|
|
|
|
vLLM merges weights for efficiency:
|
|
in_proj_qkv + in_proj_z → in_proj_qkvz [key_dim*2 + value_dim*2, hidden]
|
|
in_proj_b + in_proj_a → in_proj_ba [num_v_heads*2, hidden]
|
|
gate_proj + up_proj → gate_up_proj [intermediate*2, hidden]
|
|
|
|
This module creates HF-compatible parameter views that point to the same
|
|
GPU memory as vLLM's merged tensors. No copies — views share storage.
|
|
"""
|
|
|
|
import torch
|
|
import torch.nn as nn
|
|
|
|
|
|

# Qwen3.5-27B dimensions
HIDDEN = 5120
NUM_K_HEADS = 16
NUM_V_HEADS = 48
HEAD_K_DIM = 128
HEAD_V_DIM = 128
KEY_DIM = NUM_K_HEADS * HEAD_K_DIM    # 2048
VALUE_DIM = NUM_V_HEADS * HEAD_V_DIM  # 6144
INTERMEDIATE = 17408
NUM_LAYERS = 64
CONV_KERNEL = 4
CONV_DIM = KEY_DIM * 2 + VALUE_DIM    # 10240
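
# Cheap arithmetic self-checks on the derived sizes above; they restate the
# inline comments so a dimension typo fails loudly at import (no GPU needed).
assert KEY_DIM == 2048 and VALUE_DIM == 6144 and CONV_DIM == 10240
assert KEY_DIM * 2 + VALUE_DIM * 2 == 16384  # rows of merged in_proj_qkvz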


def vllm_to_hf_views(
    vllm_params: dict[str, torch.Tensor],
) -> dict[str, torch.Tensor]:
    """Create HF-compatible parameter views from vLLM merged weights.

    Returns a dict of HF-style parameter names → tensor views.
    The views share GPU memory with the vLLM tensors; no copies.
    """
    hf_params = {}

    for name, tensor in vllm_params.items():
        # Pass through non-merged params unchanged
        if ('in_proj_qkvz' not in name
                and 'in_proj_ba' not in name
                and 'gate_up_proj' not in name):
            hf_params[name] = tensor
            continue

        # Split merged projections into HF-style separate weights. Strip the
        # trailing '.weight' as well when computing the prefix; replacing
        # only the stem would leave '...linear_attn..weight' behind and
        # produce malformed HF parameter names.
        if 'in_proj_qkvz' in name:
            # [key_dim*2 + value_dim*2, hidden] → qkv + z
            prefix = name.replace('in_proj_qkvz.weight', '')
            qkv = tensor[:KEY_DIM * 2 + VALUE_DIM]  # [key_dim*2 + value_dim, hidden]
            z = tensor[KEY_DIM * 2 + VALUE_DIM:]    # [value_dim, hidden]
            hf_params[prefix + 'in_proj_qkv.weight'] = qkv
            hf_params[prefix + 'in_proj_z.weight'] = z

        elif 'in_proj_ba' in name:
            # [num_v_heads*2, hidden] → b + a
            prefix = name.replace('in_proj_ba.weight', '')
            b = tensor[:NUM_V_HEADS]  # [num_v_heads, hidden]
            a = tensor[NUM_V_HEADS:]  # [num_v_heads, hidden]
            hf_params[prefix + 'in_proj_b.weight'] = b
            hf_params[prefix + 'in_proj_a.weight'] = a

        elif 'gate_up_proj' in name:
            # [intermediate*2, hidden] → gate + up
            prefix = name.replace('gate_up_proj.weight', '')
            gate = tensor[:INTERMEDIATE]  # [intermediate, hidden]
            up = tensor[INTERMEDIATE:]    # [intermediate, hidden]
            hf_params[prefix + 'gate_proj.weight'] = gate
            hf_params[prefix + 'up_proj.weight'] = up

    return hf_params
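

# Illustrative, not part of the API: a self-contained check that the qkvz
# split really aliases the merged storage. The parameter name below is a
# hypothetical example; real names come from the vLLM weight export.
def _demo_qkvz_split(device: str = "cuda:0") -> None:
    name = 'model.layers.0.linear_attn.in_proj_qkvz.weight'
    merged = torch.randn(KEY_DIM * 2 + VALUE_DIM * 2, HIDDEN,
                         dtype=torch.bfloat16, device=device)
    views = vllm_to_hf_views({name: merged})
    qkv = views['model.layers.0.linear_attn.in_proj_qkv.weight']
    z = views['model.layers.0.linear_attn.in_proj_z.weight']
    assert qkv.shape == (KEY_DIM * 2 + VALUE_DIM, HIDDEN)
    assert z.shape == (VALUE_DIM, HIDDEN)
    z.zero_()  # write through the view...
    assert bool(merged[KEY_DIM * 2 + VALUE_DIM:].eq(0).all())  # ...seen in merged
    print("qkvz split aliases merged storage: OK")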


def load_hf_model_with_vllm_weights(
    vllm_params: dict[str, torch.Tensor],
    model_path: str,
    device: str = "cuda:0",
) -> nn.Module:
    """Load HF Qwen3.5 model with weights pointing to vLLM's GPU memory.

    1. Creates HF-compatible views from vLLM's merged weights
    2. Instantiates the HF model with empty weights
    3. Replaces model parameters with the views
    4. Returns model ready for forward+backward (autograd enabled)

    Note: the views already carry vLLM's device, so `device` is currently
    unused; it is kept for call-site compatibility.
    """
    from transformers import AutoModelForCausalLM, AutoConfig

    # Create HF-compatible views
    hf_params = vllm_to_hf_views(vllm_params)

    # Load config
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

    # Create model with empty weights (no disk I/O)
    with torch.device('meta'):
        model = AutoModelForCausalLM.from_config(
            config, trust_remote_code=True)

    # Replace parameters with views into vLLM memory. Materialize the list
    # first so we don't mutate parameter dicts mid-iteration.
    replaced = 0
    missing = []
    for name, _ in list(model.named_parameters()):
        if name in hf_params:
            # Replace with view (shared GPU memory)
            parts = name.rsplit('.', 1)
            parent = model
            for part in parts[0].split('.'):
                parent = getattr(parent, part)
            setattr(parent, parts[1],
                    nn.Parameter(hf_params[name], requires_grad=True))
            replaced += 1
        else:
            missing.append(name)

    print(f"Replaced {replaced} parameters with vLLM memory views")
    if missing:
        # Leftovers are still on the meta device and would break a forward
        # pass; surface them.
        print(f"Missing {len(missing)} parameters: {missing[:5]}...")

    model.train()
    return model
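

# Hedged end-to-end sketch: one forward+backward to confirm gradients land on
# the view parameters, i.e. autograd writes into vLLM-owned GPU memory. It
# assumes `vllm_params` covers every model parameter (nothing left on the
# meta device); the dummy batch shape is illustrative only.
def _example_backward_through_views(vllm_params: dict[str, torch.Tensor],
                                    model_path: str) -> None:
    model = load_hf_model_with_vllm_weights(vllm_params, model_path)
    input_ids = torch.randint(0, model.config.vocab_size, (1, 32),
                              device="cuda:0")
    loss = model(input_ids=input_ids, labels=input_ids).loss
    loss.backward()
    with_grad = sum(p.grad is not None for p in model.parameters())
    print(f"loss={loss.item():.4f}; {with_grad} params received gradients")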


def validate_views(vllm_params: dict[str, torch.Tensor],
                   hf_params: dict[str, torch.Tensor]):
    """Verify that HF views share storage with vLLM tensors."""
    for vllm_name, vllm_tensor in vllm_params.items():
        if 'in_proj_qkvz' in vllm_name:
            prefix = vllm_name.replace('in_proj_qkvz.weight', '')
            qkv_name = prefix + 'in_proj_qkv.weight'
            z_name = prefix + 'in_proj_z.weight'
            # untyped_storage() is the non-deprecated spelling of storage()
            # for this pointer comparison on modern PyTorch.
            if qkv_name in hf_params:
                assert hf_params[qkv_name].untyped_storage().data_ptr() == \
                    vllm_tensor.untyped_storage().data_ptr(), \
                    f"{qkv_name} doesn't share storage!"
            if z_name in hf_params:
                assert hf_params[z_name].untyped_storage().data_ptr() == \
                    vllm_tensor.untyped_storage().data_ptr(), \
                    f"{z_name} doesn't share storage!"

    print("All views validated; shared storage confirmed")
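

if __name__ == "__main__":
    # Synthetic smoke test (requires a CUDA device). Real runs obtain
    # vllm_params over CUDA IPC via export_weights.py instead.
    _demo_qkvz_split()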