"""Monkey-patch vLLM to export weight IPC handles on startup.

Usage — install the apollo_plugin package:

    pip install -e /path/to/training

vLLM then auto-discovers and loads the plugin via its entry point. To load only this plugin, filter by name:

    VLLM_PLUGINS=apollo vllm serve Qwen/Qwen3.5-27B ...

The hook patches vLLM's GPU worker (``Worker.load_model``) to export IPC
handles after model loading completes. The handles are saved to a file
(``HANDLE_PATH``) that the Apollo training process reads.
"""

import atexit
import torch
from pathlib import Path

# File that export_model_weights() writes the handle dictionary to (via
# torch.save). Per the module docstring, the Apollo training process reads it.
HANDLE_PATH = "/tmp/vllm_weight_handles.pt"


def export_model_weights(model, model_path: str | None = None):
    """Export CUDA IPC handles for all model parameters."""
    from torch.multiprocessing.reductions import reduce_tensor

    handles = {}
    total_bytes = 0

    for name, param in model.named_parameters():
        if param.device.type != 'cuda':
            continue
        handle = reduce_tensor(param.data)
        handles[name] = {
            'handle': handle,
            'shape': list(param.shape),
            'dtype': str(param.dtype),
        }
        total_bytes += param.nelement() * param.element_size()

    # Include metadata for training worker
    handles['__metadata__'] = {
        'model_path': model_path,
        'num_params': len(handles),
    }

    torch.save(handles, HANDLE_PATH)
    print(f"[apollo] Exported {len(handles)} weight handles "
          f"({total_bytes / 1e9:.1f} GB) to {HANDLE_PATH}")


def _patch_model_runner():
    """Patch ``gpu_worker.Worker.load_model`` to export handles after loading.

    vLLM loads the model in a subprocess (EngineCore_DP0), so we
    can't patch from the parent. Instead, patch the worker's
    ``load_model`` at the module level — the subprocess imports the
    same modules.

    The replacement calls the original ``load_model``, then attempts
    ``export_model_weights`` on ``self.model_runner.model``. Export failures
    are printed and swallowed so model loading itself is never broken by
    the hook; the original return value is always passed through.

    Calling this function more than once is safe: an already-installed
    hook is detected and left in place, so weights are not exported
    multiple times per load.
    """
    import functools

    from vllm.v1.worker import gpu_worker

    worker_cls = gpu_worker.Worker
    original_load = worker_cls.load_model

    # Already patched (e.g. the plugin entry point ran twice in this
    # process) — wrapping again would export once per extra layer.
    if getattr(original_load, "_apollo_export_hook", False):
        return

    @functools.wraps(original_load)
    def patched_load(self, *args, **kwargs):
        result = original_load(self, *args, **kwargs)
        try:
            model_path = self.vllm_config.model_config.model
            export_model_weights(self.model_runner.model, model_path)
        except Exception as e:
            # Best-effort: a failed export must not take down the worker.
            print(f"[apollo] Failed to export weights: {e}")
        return result

    patched_load._apollo_export_hook = True
    worker_cls.load_model = patched_load
    print("[apollo] Weight export hook installed")