consciousness/training/weight_mapping.py

"""Map between vLLM's merged weight layout and HuggingFace's separate layout.

vLLM merges weights for efficiency:
  in_proj_qkv + in_proj_z → in_proj_qkvz  [key_dim*2 + value_dim*2, hidden]
  in_proj_b + in_proj_a   → in_proj_ba    [num_v_heads*2, hidden]
  gate_proj + up_proj      → gate_up_proj  [intermediate*2, hidden]

This module creates HF-compatible parameter views that point to the same
GPU memory as vLLM's merged tensors. No copies — views share storage.
"""

import torch
import torch.nn as nn


# Qwen3.5-27B dimensions
HIDDEN = 5120
INTERMEDIATE = 17408
NUM_LAYERS = 64

# GDN projection geometry (drives the in_proj_qkvz / in_proj_ba splits)
NUM_K_HEADS = 16
NUM_V_HEADS = 48
HEAD_K_DIM = 128
HEAD_V_DIM = 128
KEY_DIM = HEAD_K_DIM * NUM_K_HEADS     # 2048
VALUE_DIM = HEAD_V_DIM * NUM_V_HEADS   # 6144
CONV_KERNEL = 4
CONV_DIM = 2 * KEY_DIM + VALUE_DIM     # 10240

# Full attention geometry (drives the qkv_proj split)
NUM_ATTN_HEADS = 24       # full attention q heads
NUM_ATTN_KV_HEADS = 4     # full attention kv heads
ATTN_HEAD_DIM = 256
# Q uses 2x head_dim (512) vs KV head_dim (256) in Qwen3.5
ATTN_Q_HEAD_DIM = 2 * ATTN_HEAD_DIM             # 512
ATTN_Q_DIM = ATTN_Q_HEAD_DIM * NUM_ATTN_HEADS   # 12288
ATTN_K_DIM = ATTN_HEAD_DIM * NUM_ATTN_KV_HEADS  # 1024
ATTN_V_DIM = ATTN_HEAD_DIM * NUM_ATTN_KV_HEADS  # 1024
# Rows of vLLM's merged qkv_proj.weight: 12288 + 1024 + 1024 = 14336


def vllm_to_hf_views(vllm_params: dict[str, torch.Tensor]
                     ) -> dict[str, torch.Tensor]:
    """Create HF-compatible parameter views from vLLM merged weights.

    Every key has a leading 'language_model.' stripped. A merged projection
    weight (a name whose last two components are ``<module>.weight`` with
    ``<module>`` one of in_proj_qkvz, in_proj_ba, qkv_proj, gate_up_proj) is
    cut along dim 0 into the separate HF projections. Each piece is a basic
    slice of the input tensor, i.e. a view that shares memory with it — no
    copies. Everything else is passed through under its stripped name; that
    includes non-weight parameters of merged modules (e.g. a bias), which
    are not split.

    Args:
        vllm_params: vLLM parameter name → tensor.

    Returns:
        Dict of HF-style parameter name → tensor (a view for split weights,
        the original tensor object otherwise).

    Raises:
        ValueError: a merged weight's first dimension differs from the sum
            of the piece sizes given by the module-level constants. Slicing
            such a tensor would silently produce wrong-sized views.
    """
    # Merged vLLM module → ordered (HF module, rows along dim 0) pieces.
    split_layouts = {
        'in_proj_qkvz': (('in_proj_qkv', KEY_DIM * 2 + VALUE_DIM),
                         ('in_proj_z', VALUE_DIM)),
        'in_proj_ba': (('in_proj_b', NUM_V_HEADS),
                       ('in_proj_a', NUM_V_HEADS)),
        'qkv_proj': (('q_proj', ATTN_Q_DIM),
                     ('k_proj', ATTN_K_DIM),
                     ('v_proj', ATTN_V_DIM)),
        'gate_up_proj': (('gate_proj', INTERMEDIATE),
                         ('up_proj', INTERMEDIATE)),
    }

    hf_params = {}

    for name, tensor in vllm_params.items():
        # vLLM uses 'language_model.model.layers...' but HF's text model
        # uses 'model.layers...'. Strip the 'language_model.' prefix.
        hf_name = name.removeprefix('language_model.')

        # Match on exact name components so that only '<module>.weight' is
        # split; a substring test would also catch e.g. '<module>.bias' and
        # emit it under a mangled key.
        module_path, _, leaf = hf_name.rpartition('.')
        owner, _, module = module_path.rpartition('.')
        pieces = split_layouts.get(module) if leaf == 'weight' else None
        if pieces is None:
            # Pass through unchanged (norms, biases, out_proj, etc.)
            hf_params[hf_name] = tensor
            continue

        expected_rows = sum(rows for _, rows in pieces)
        actual_rows = tensor.shape[0] if tensor.dim() > 0 else 0
        if actual_rows != expected_rows:
            raise ValueError(
                f"{name}: expected {expected_rows} rows along dim 0 for "
                f"merged '{module}', got shape {tuple(tensor.shape)}")

        prefix = f'{owner}.' if owner else ''
        start = 0
        for hf_module, rows in pieces:
            # Basic slicing returns a view; no data is copied.
            hf_params[f'{prefix}{hf_module}.weight'] = \
                tensor[start:start + rows]
            start += rows

    return hf_params


def load_hf_model_with_vllm_weights(
    vllm_params: dict[str, torch.Tensor],
    model_path: str,
    device: str = "cuda:0",
) -> nn.Module:
    """Load HF Qwen3.5 model with weights pointing to vLLM's memory.

    1. Creates HF-compatible views from vLLM's merged weights
    2. Instantiates the HF model on the 'meta' device (no weight data)
    3. Replaces every model parameter that has a matching view with an
       ``nn.Parameter`` wrapping that view (``requires_grad=True``)
    4. Puts the model in train mode and returns it

    Args:
        vllm_params: vLLM parameter name → tensor.
        model_path: passed to ``AutoConfig.from_pretrained`` to obtain the
            model config.
        device: currently unused; replaced parameters live wherever the
            tensors in ``vllm_params`` live.

    Returns:
        The HF model. Parameters with no matching view are reported on
        stdout and are left as created under the 'meta' device context.
    """
    from transformers import AutoModelForCausalLM, AutoConfig

    # Create HF-compatible views
    hf_params = vllm_to_hf_views(vllm_params)

    # Load config
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

    # Create model with empty weights (no disk I/O)
    with torch.device('meta'):
        model = AutoModelForCausalLM.from_config(
            config, trust_remote_code=True)

    # Replace parameters with views into vLLM memory
    replaced = 0
    missing = []
    mismatched = []
    # Snapshot the parameter list first: named_parameters() is a lazy
    # generator over the very module tree that is rewritten below.
    for name, param in list(model.named_parameters()):
        view = hf_params.get(name)
        if view is None:
            missing.append(name)
            continue

        if view.shape != param.shape:
            # Still replaced (as before), but surfaced so a layout problem
            # is visible here rather than as a shape error mid-forward.
            mismatched.append(name)

        # rpartition also handles a parameter registered directly on the
        # root module: get_submodule('') returns the model itself.
        owner_path, _, attr = name.rpartition('.')
        owner = model.get_submodule(owner_path)
        setattr(owner, attr, nn.Parameter(view, requires_grad=True))
        replaced += 1

    print(f"Replaced {replaced} parameters with vLLM memory views")
    if missing:
        print(f"Missing {len(missing)} parameters: {missing[:5]}...")
    if mismatched:
        print(f"Shape mismatch for {len(mismatched)} parameters: "
              f"{mismatched[:5]}...")

    model.train()
    return model


def validate_views(vllm_params: dict[str, torch.Tensor],
                   hf_params: dict[str, torch.Tensor]):
    """Verify that HF views share storage with vLLM tensors.

    For every merged vLLM weight (in_proj_qkvz, in_proj_ba, qkv_proj,
    gate_up_proj) the corresponding split HF entries are looked up in
    ``hf_params`` and their underlying storage pointer is compared with the
    merged tensor's. HF names are tried both with the 'language_model.'
    prefix stripped and with the raw vLLM prefix, so either key style is
    checked. Split entries absent from ``hf_params`` are skipped.

    Args:
        vllm_params: vLLM parameter name → merged tensor.
        hf_params: HF parameter name → tensor expected to be a view.

    Raises:
        AssertionError: an HF entry does not share storage with its merged
            vLLM tensor. Raised explicitly so the check survives ``-O``.
    """
    # Merged vLLM module → HF modules its weight is split into.
    split_modules = {
        'in_proj_qkvz': ('in_proj_qkv', 'in_proj_z'),
        'in_proj_ba': ('in_proj_b', 'in_proj_a'),
        'qkv_proj': ('q_proj', 'k_proj', 'v_proj'),
        'gate_up_proj': ('gate_proj', 'up_proj'),
    }

    for vllm_name, vllm_tensor in vllm_params.items():
        module_path, _, leaf = vllm_name.rpartition('.')
        owner, _, module = module_path.rpartition('.')
        if leaf != 'weight' or module not in split_modules:
            continue

        raw_prefix = f'{owner}.' if owner else ''
        # dict.fromkeys de-duplicates while keeping order when the name
        # carries no 'language_model.' prefix.
        prefixes = dict.fromkeys(
            (raw_prefix.removeprefix('language_model.'), raw_prefix))
        merged_ptr = vllm_tensor.untyped_storage().data_ptr()

        for prefix in prefixes:
            for hf_module in split_modules[module]:
                hf_name = f'{prefix}{hf_module}.weight'
                if hf_name not in hf_params:
                    continue
                view_ptr = hf_params[hf_name].untyped_storage().data_ptr()
                if view_ptr != merged_ptr:
                    raise AssertionError(
                        f"{hf_name} doesn't share storage!")

    print("All views validated — shared storage confirmed")
apollo-mini training system: initial implementation Core components for online fine-tuning of Qwen3.5-27B with CUDA IPC shared weight memory between vLLM and the training process: - apollo_mini.py: rank-1 optimizer (SGD memory, AdamW quality) - apollo_worker.py: HTTP daemon coordinating training with vLLM - weight_mapping.py: vLLM merged → HF separate layout (zero-copy views) - training_example.py: tokenization with chat template - export_weights.py: CUDA IPC handle export from vLLM - train.py: standalone training script (alternative to daemon) - DESIGN.md: architecture and protocol documentation Validated: CUDA IPC autograd works on real Qwen3.5 weights (B200). Apollo-Mini rank-1 projection + scaling + in-place update confirmed. Co-Authored-By: Kent Overstreet <kent.overstreet@gmail.com> 2026-03-30 22:02:37 -04:00			`"""Map between vLLM's merged weight layout and HuggingFace's separate layout.`

			`vLLM merges weights for efficiency:`
			`in_proj_qkv + in_proj_z → in_proj_qkvz [key_dim2 + value_dim2, hidden]`
			`in_proj_b + in_proj_a → in_proj_ba [num_v_heads*2, hidden]`
			`gate_proj + up_proj → gate_up_proj [intermediate*2, hidden]`

			`This module creates HF-compatible parameter views that point to the same`
			`GPU memory as vLLM's merged tensors. No copies — views share storage.`
			`"""`

			`import torch`
			`import torch.nn as nn`


			`# Qwen3.5-27B dimensions`
			`HIDDEN = 5120`
			`NUM_K_HEADS = 16`
			`NUM_V_HEADS = 48`
weight_mapping: fix name prefix, add attention QKV dims 2026-03-30 23:09:08 -04:00			`NUM_ATTN_HEADS = 24 # full attention q heads`
			`NUM_ATTN_KV_HEADS = 4 # full attention kv heads`
			`ATTN_HEAD_DIM = 256`
apollo-mini training system: initial implementation Core components for online fine-tuning of Qwen3.5-27B with CUDA IPC shared weight memory between vLLM and the training process: - apollo_mini.py: rank-1 optimizer (SGD memory, AdamW quality) - apollo_worker.py: HTTP daemon coordinating training with vLLM - weight_mapping.py: vLLM merged → HF separate layout (zero-copy views) - training_example.py: tokenization with chat template - export_weights.py: CUDA IPC handle export from vLLM - train.py: standalone training script (alternative to daemon) - DESIGN.md: architecture and protocol documentation Validated: CUDA IPC autograd works on real Qwen3.5 weights (B200). Apollo-Mini rank-1 projection + scaling + in-place update confirmed. Co-Authored-By: Kent Overstreet <kent.overstreet@gmail.com> 2026-03-30 22:02:37 -04:00			`HEAD_K_DIM = 128`
			`HEAD_V_DIM = 128`
			`KEY_DIM = NUM_K_HEADS * HEAD_K_DIM # 2048`
			`VALUE_DIM = NUM_V_HEADS * HEAD_V_DIM # 6144`
			`INTERMEDIATE = 17408`
			`NUM_LAYERS = 64`
			`CONV_KERNEL = 4`
			`CONV_DIM = KEY_DIM * 2 + VALUE_DIM # 10240`

weight_mapping: fix name prefix, add attention QKV dims 2026-03-30 23:09:08 -04:00			`# Full attention QKV dimensions`
			`# Q uses 2x head_dim (512) vs KV head_dim (256) in Qwen3.5`
			`ATTN_Q_HEAD_DIM = ATTN_HEAD_DIM * 2 # 512`
			`ATTN_Q_DIM = NUM_ATTN_HEADS * ATTN_Q_HEAD_DIM # 12288`
			`ATTN_K_DIM = NUM_ATTN_KV_HEADS * ATTN_HEAD_DIM # 1024`
			`ATTN_V_DIM = NUM_ATTN_KV_HEADS * ATTN_HEAD_DIM # 1024`
			`# Total: 12288 + 1024 + 1024 = 14336 = vLLM's qkv_proj.weight[0]`

apollo-mini training system: initial implementation Core components for online fine-tuning of Qwen3.5-27B with CUDA IPC shared weight memory between vLLM and the training process: - apollo_mini.py: rank-1 optimizer (SGD memory, AdamW quality) - apollo_worker.py: HTTP daemon coordinating training with vLLM - weight_mapping.py: vLLM merged → HF separate layout (zero-copy views) - training_example.py: tokenization with chat template - export_weights.py: CUDA IPC handle export from vLLM - train.py: standalone training script (alternative to daemon) - DESIGN.md: architecture and protocol documentation Validated: CUDA IPC autograd works on real Qwen3.5 weights (B200). Apollo-Mini rank-1 projection + scaling + in-place update confirmed. Co-Authored-By: Kent Overstreet <kent.overstreet@gmail.com> 2026-03-30 22:02:37 -04:00
			`def vllm_to_hf_views(vllm_params: dict[str, torch.Tensor]`
			`) -> dict[str, torch.Tensor]:`
			`"""Create HF-compatible parameter views from vLLM merged weights.`

			`Returns a dict of HF-style parameter names → tensor views.`
			`The views share GPU memory with the vLLM tensors — no copies.`
			`"""`
			`hf_params = {}`

			`for name, tensor in vllm_params.items():`
weight_mapping: strip language_model prefix to match HF text model names 2026-03-30 23:11:03 -04:00			`# vLLM uses 'language_model.model.layers...' but HF's text model`
			`# uses 'model.layers...'. Strip the 'language_model.' prefix.`
			`hf_name = name.removeprefix('language_model.')`
apollo-mini training system: initial implementation Core components for online fine-tuning of Qwen3.5-27B with CUDA IPC shared weight memory between vLLM and the training process: - apollo_mini.py: rank-1 optimizer (SGD memory, AdamW quality) - apollo_worker.py: HTTP daemon coordinating training with vLLM - weight_mapping.py: vLLM merged → HF separate layout (zero-copy views) - training_example.py: tokenization with chat template - export_weights.py: CUDA IPC handle export from vLLM - train.py: standalone training script (alternative to daemon) - DESIGN.md: architecture and protocol documentation Validated: CUDA IPC autograd works on real Qwen3.5 weights (B200). Apollo-Mini rank-1 projection + scaling + in-place update confirmed. Co-Authored-By: Kent Overstreet <kent.overstreet@gmail.com> 2026-03-30 22:02:37 -04:00
			`# Split merged projections into HF-style separate weights`
			`if 'in_proj_qkvz' in name:`
weight_mapping: fix name prefix, add attention QKV dims 2026-03-30 23:09:08 -04:00			`# GDN: [key_dim2 + value_dim2, hidden] → qkv + z`
			`prefix = hf_name.replace('in_proj_qkvz.weight', '')`
			`qkv = tensor[:KEY_DIM * 2 + VALUE_DIM]`
			`z = tensor[KEY_DIM * 2 + VALUE_DIM:]`
apollo-mini training system: initial implementation Core components for online fine-tuning of Qwen3.5-27B with CUDA IPC shared weight memory between vLLM and the training process: - apollo_mini.py: rank-1 optimizer (SGD memory, AdamW quality) - apollo_worker.py: HTTP daemon coordinating training with vLLM - weight_mapping.py: vLLM merged → HF separate layout (zero-copy views) - training_example.py: tokenization with chat template - export_weights.py: CUDA IPC handle export from vLLM - train.py: standalone training script (alternative to daemon) - DESIGN.md: architecture and protocol documentation Validated: CUDA IPC autograd works on real Qwen3.5 weights (B200). Apollo-Mini rank-1 projection + scaling + in-place update confirmed. Co-Authored-By: Kent Overstreet <kent.overstreet@gmail.com> 2026-03-30 22:02:37 -04:00			`hf_params[prefix + 'in_proj_qkv.weight'] = qkv`
			`hf_params[prefix + 'in_proj_z.weight'] = z`

			`elif 'in_proj_ba' in name:`
weight_mapping: fix name prefix, add attention QKV dims 2026-03-30 23:09:08 -04:00			`# GDN: [num_v_heads*2, hidden] → b + a`
			`prefix = hf_name.replace('in_proj_ba.weight', '')`
			`b = tensor[:NUM_V_HEADS]`
			`a = tensor[NUM_V_HEADS:]`
apollo-mini training system: initial implementation Core components for online fine-tuning of Qwen3.5-27B with CUDA IPC shared weight memory between vLLM and the training process: - apollo_mini.py: rank-1 optimizer (SGD memory, AdamW quality) - apollo_worker.py: HTTP daemon coordinating training with vLLM - weight_mapping.py: vLLM merged → HF separate layout (zero-copy views) - training_example.py: tokenization with chat template - export_weights.py: CUDA IPC handle export from vLLM - train.py: standalone training script (alternative to daemon) - DESIGN.md: architecture and protocol documentation Validated: CUDA IPC autograd works on real Qwen3.5 weights (B200). Apollo-Mini rank-1 projection + scaling + in-place update confirmed. Co-Authored-By: Kent Overstreet <kent.overstreet@gmail.com> 2026-03-30 22:02:37 -04:00			`hf_params[prefix + 'in_proj_b.weight'] = b`
			`hf_params[prefix + 'in_proj_a.weight'] = a`

weight_mapping: fix name prefix, add attention QKV dims 2026-03-30 23:09:08 -04:00			`elif 'qkv_proj' in name:`
			`# Full attention: [q_dim + k_dim + v_dim, hidden] → q + k + v`
			`prefix = hf_name.replace('qkv_proj.weight', '')`
			`q = tensor[:ATTN_Q_DIM]`
			`k = tensor[ATTN_Q_DIM:ATTN_Q_DIM + ATTN_K_DIM]`
			`v = tensor[ATTN_Q_DIM + ATTN_K_DIM:]`
			`hf_params[prefix + 'q_proj.weight'] = q`
			`hf_params[prefix + 'k_proj.weight'] = k`
			`hf_params[prefix + 'v_proj.weight'] = v`

apollo-mini training system: initial implementation Core components for online fine-tuning of Qwen3.5-27B with CUDA IPC shared weight memory between vLLM and the training process: - apollo_mini.py: rank-1 optimizer (SGD memory, AdamW quality) - apollo_worker.py: HTTP daemon coordinating training with vLLM - weight_mapping.py: vLLM merged → HF separate layout (zero-copy views) - training_example.py: tokenization with chat template - export_weights.py: CUDA IPC handle export from vLLM - train.py: standalone training script (alternative to daemon) - DESIGN.md: architecture and protocol documentation Validated: CUDA IPC autograd works on real Qwen3.5 weights (B200). Apollo-Mini rank-1 projection + scaling + in-place update confirmed. Co-Authored-By: Kent Overstreet <kent.overstreet@gmail.com> 2026-03-30 22:02:37 -04:00			`elif 'gate_up_proj' in name:`
weight_mapping: fix name prefix, add attention QKV dims 2026-03-30 23:09:08 -04:00			`# MLP: [intermediate*2, hidden] → gate + up`
			`prefix = hf_name.replace('gate_up_proj.weight', '')`
			`gate = tensor[:INTERMEDIATE]`
			`up = tensor[INTERMEDIATE:]`
apollo-mini training system: initial implementation Core components for online fine-tuning of Qwen3.5-27B with CUDA IPC shared weight memory between vLLM and the training process: - apollo_mini.py: rank-1 optimizer (SGD memory, AdamW quality) - apollo_worker.py: HTTP daemon coordinating training with vLLM - weight_mapping.py: vLLM merged → HF separate layout (zero-copy views) - training_example.py: tokenization with chat template - export_weights.py: CUDA IPC handle export from vLLM - train.py: standalone training script (alternative to daemon) - DESIGN.md: architecture and protocol documentation Validated: CUDA IPC autograd works on real Qwen3.5 weights (B200). Apollo-Mini rank-1 projection + scaling + in-place update confirmed. Co-Authored-By: Kent Overstreet <kent.overstreet@gmail.com> 2026-03-30 22:02:37 -04:00			`hf_params[prefix + 'gate_proj.weight'] = gate`
			`hf_params[prefix + 'up_proj.weight'] = up`

weight_mapping: fix name prefix, add attention QKV dims 2026-03-30 23:09:08 -04:00			`else:`
			`# Pass through unchanged (norms, biases, out_proj, etc.)`
			`hf_params[hf_name] = tensor`

apollo-mini training system: initial implementation Core components for online fine-tuning of Qwen3.5-27B with CUDA IPC shared weight memory between vLLM and the training process: - apollo_mini.py: rank-1 optimizer (SGD memory, AdamW quality) - apollo_worker.py: HTTP daemon coordinating training with vLLM - weight_mapping.py: vLLM merged → HF separate layout (zero-copy views) - training_example.py: tokenization with chat template - export_weights.py: CUDA IPC handle export from vLLM - train.py: standalone training script (alternative to daemon) - DESIGN.md: architecture and protocol documentation Validated: CUDA IPC autograd works on real Qwen3.5 weights (B200). Apollo-Mini rank-1 projection + scaling + in-place update confirmed. Co-Authored-By: Kent Overstreet <kent.overstreet@gmail.com> 2026-03-30 22:02:37 -04:00			`return hf_params`


			`def load_hf_model_with_vllm_weights(`
			`vllm_params: dict[str, torch.Tensor],`
			`model_path: str,`
			`device: str = "cuda:0",`
			`) -> nn.Module:`
			`"""Load HF Qwen3.5 model with weights pointing to vLLM's GPU memory.`

			`1. Creates HF-compatible views from vLLM's merged weights`
			`2. Instantiates the HF model with empty weights`
			`3. Replaces model parameters with the views`
			`4. Returns model ready for forward+backward (autograd enabled)`
			`"""`
			`from transformers import AutoModelForCausalLM, AutoConfig`

			`# Create HF-compatible views`
			`hf_params = vllm_to_hf_views(vllm_params)`

			`# Load config`
			`config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)`

			`# Create model with empty weights (no disk I/O)`
			`with torch.device('meta'):`
			`model = AutoModelForCausalLM.from_config(`
			`config, trust_remote_code=True)`

			`# Replace parameters with views into vLLM memory`
			`replaced = 0`
			`missing = []`
			`for name, param in model.named_parameters():`
			`if name in hf_params:`
			`# Replace with view (shared GPU memory)`
			`parts = name.rsplit('.', 1)`
			`parent = model`
			`for part in parts[0].split('.'):`
			`parent = getattr(parent, part)`
			`setattr(parent, parts[1],`
			`nn.Parameter(hf_params[name], requires_grad=True))`
			`replaced += 1`
			`else:`
			`missing.append(name)`

			`print(f"Replaced {replaced} parameters with vLLM memory views")`
			`if missing:`
			`print(f"Missing {len(missing)} parameters: {missing[:5]}...")`

			`model.train()`
			`return model`


			`def validate_views(vllm_params: dict[str, torch.Tensor],`
			`hf_params: dict[str, torch.Tensor]):`
			`"""Verify that HF views share storage with vLLM tensors."""`
			`for vllm_name, vllm_tensor in vllm_params.items():`
			`if 'in_proj_qkvz' in vllm_name:`
			`prefix = vllm_name.replace('in_proj_qkvz.weight', '')`
			`qkv_name = prefix + 'in_proj_qkv.weight'`
			`z_name = prefix + 'in_proj_z.weight'`
			`if qkv_name in hf_params:`
			`assert hf_params[qkv_name].storage().data_ptr() == \`
			`vllm_tensor.storage().data_ptr(), \`
			`f"{qkv_name} doesn't share storage!"`
			`if z_name in hf_params:`
			`assert hf_params[z_name].storage().data_ptr() == \`
			`vllm_tensor.storage().data_ptr(), \`
			`f"{z_name} doesn't share storage!"`

			`print("All views validated — shared storage confirmed")`