"""Map between vLLM's merged weight layout and HuggingFace's separate layout.

vLLM merges weights for efficiency:
  in_proj_qkv + in_proj_z → in_proj_qkvz  [key_dim*2 + value_dim*2, hidden]
  in_proj_b + in_proj_a   → in_proj_ba    [num_v_heads*2, hidden]
  gate_proj + up_proj      → gate_up_proj  [intermediate*2, hidden]

This module creates HF-compatible parameter views that point to the same
GPU memory as vLLM's merged tensors. No copies — views share storage.
"""

import torch
import torch.nn as nn


# Qwen3.5-27B dimensions
HIDDEN = 5120
NUM_K_HEADS = 16
NUM_V_HEADS = 48
HEAD_K_DIM = 128
HEAD_V_DIM = 128
KEY_DIM = NUM_K_HEADS * HEAD_K_DIM     # 2048
VALUE_DIM = NUM_V_HEADS * HEAD_V_DIM   # 6144
INTERMEDIATE = 17408
NUM_LAYERS = 64
CONV_KERNEL = 4
CONV_DIM = KEY_DIM * 2 + VALUE_DIM     # 10240


def vllm_to_hf_views(vllm_params: dict[str, torch.Tensor]
                      ) -> dict[str, torch.Tensor]:
    """Create HF-compatible parameter views from vLLM merged weights.

    Returns a dict of HF-style parameter names → tensor views.
    The views share GPU memory with the vLLM tensors — no copies.
    """
    hf_params = {}

    for name, tensor in vllm_params.items():
        # Pass through non-merged params unchanged
        if 'in_proj_qkvz' not in name and \
           'in_proj_ba' not in name and \
           'gate_up_proj' not in name:
            hf_params[name] = tensor
            continue

        # Split merged projections into HF-style separate weights
        if 'in_proj_qkvz' in name:
            # [key_dim*2 + value_dim*2, hidden] → qkv + z
            prefix = name.replace('in_proj_qkvz', '')
            qkv = tensor[:KEY_DIM * 2 + VALUE_DIM]  # [key_dim*2 + value_dim, hidden]
            z = tensor[KEY_DIM * 2 + VALUE_DIM:]     # [value_dim, hidden]
            hf_params[prefix + 'in_proj_qkv.weight'] = qkv
            hf_params[prefix + 'in_proj_z.weight'] = z

        elif 'in_proj_ba' in name:
            # [num_v_heads*2, hidden] → b + a
            prefix = name.replace('in_proj_ba', '')
            b = tensor[:NUM_V_HEADS]      # [num_v_heads, hidden]
            a = tensor[NUM_V_HEADS:]       # [num_v_heads, hidden]
            hf_params[prefix + 'in_proj_b.weight'] = b
            hf_params[prefix + 'in_proj_a.weight'] = a

        elif 'gate_up_proj' in name:
            # [intermediate*2, hidden] → gate + up
            prefix = name.replace('gate_up_proj', '')
            gate = tensor[:INTERMEDIATE]   # [intermediate, hidden]
            up = tensor[INTERMEDIATE:]     # [intermediate, hidden]
            hf_params[prefix + 'gate_proj.weight'] = gate
            hf_params[prefix + 'up_proj.weight'] = up

    return hf_params


def load_hf_model_with_vllm_weights(
    vllm_params: dict[str, torch.Tensor],
    model_path: str,
    device: str = "cuda:0",
) -> nn.Module:
    """Load HF Qwen3.5 model with weights pointing to vLLM's GPU memory.

    1. Creates HF-compatible views from vLLM's merged weights
    2. Instantiates the HF model with empty weights
    3. Replaces model parameters with the views
    4. Returns model ready for forward+backward (autograd enabled)
    """
    from transformers import AutoModelForCausalLM, AutoConfig

    # Create HF-compatible views
    hf_params = vllm_to_hf_views(vllm_params)

    # Load config
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

    # Create model with empty weights (no disk I/O)
    with torch.device('meta'):
        model = AutoModelForCausalLM.from_config(
            config, trust_remote_code=True)

    # Replace parameters with views into vLLM memory
    replaced = 0
    missing = []
    for name, param in model.named_parameters():
        if name in hf_params:
            # Replace with view (shared GPU memory)
            parts = name.rsplit('.', 1)
            parent = model
            for part in parts[0].split('.'):
                parent = getattr(parent, part)
            setattr(parent, parts[1],
                    nn.Parameter(hf_params[name], requires_grad=True))
            replaced += 1
        else:
            missing.append(name)

    print(f"Replaced {replaced} parameters with vLLM memory views")
    if missing:
        print(f"Missing {len(missing)} parameters: {missing[:5]}...")

    model.train()
    return model


def validate_views(vllm_params: dict[str, torch.Tensor],
                   hf_params: dict[str, torch.Tensor]):
    """Verify that HF views share storage with vLLM tensors."""
    for vllm_name, vllm_tensor in vllm_params.items():
        if 'in_proj_qkvz' in vllm_name:
            prefix = vllm_name.replace('in_proj_qkvz.weight', '')
            qkv_name = prefix + 'in_proj_qkv.weight'
            z_name = prefix + 'in_proj_z.weight'
            if qkv_name in hf_params:
                assert hf_params[qkv_name].storage().data_ptr() == \
                       vllm_tensor.storage().data_ptr(), \
                       f"{qkv_name} doesn't share storage!"
            if z_name in hf_params:
                assert hf_params[z_name].storage().data_ptr() == \
                       vllm_tensor.storage().data_ptr(), \
                       f"{z_name} doesn't share storage!"

    print("All views validated — shared storage confirmed")