"""Fit a functional form to the LN γ trajectory across layers; derive the
effective attention temperature T(L) from known coupling formulas.

Rules of what scales with depth (from literature):
  DeepNorm: α_dec = (2M)^(1/4), β_dec = (8M)^(-1/4). Same per layer — does
    NOT depend on layer index l. The free variation across layers has to
    live in LN γ.
  Depth-μP: block multiplier a/√L, LR η/√L. Same per layer.
  So γ(L) is the family carrying the per-layer schedule.

Try fitting forms:
  γ(L) = a · L^b              (power law in layer index)
  γ(L) = a · exp(b·L)         (exponential)
  γ(L) = a + b·L              (linear)
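  γ(L) = a + b·L^c (free c)  (power law with free exponent; not implemented in this script)
  piecewise exponential       (two exp segments joined at a best-fit split layer)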

Report fit quality (R², residual statistics), and for the best fit, compute
the derived T(L) curve.
"""
import json
import numpy as np
from math import log, exp


def fit_power(L, y):
    """Fit the power law ``y ≈ a · L^b`` via a straight line in log-log space.

    Only points with ``L > 0`` and ``y > 0`` take part in the regression
    ``log y ≈ log a + b · log L``. The prediction and R² are then evaluated
    in linear space over every supplied point, including any that were
    excluded from the regression.

    ``L`` and ``y`` are indexed with a boolean mask, so they are expected to
    be NumPy arrays of equal length.

    Returns a dict with keys ``form``, ``a``, ``b``, ``r2`` and ``yhat``
    (the model evaluated at every ``L``).
    """
    usable = (L > 0) & (y > 0)
    slope, intercept = np.polyfit(np.log(L[usable]), np.log(y[usable]), 1)
    prefactor = np.exp(intercept)
    predicted = prefactor * (L**slope)

    # Coefficient of determination in the original (non-log) units.
    ss_res = np.sum((y - predicted) ** 2)
    ss_tot = np.sum((y - y.mean()) ** 2)
    return {
        "form": "a*L^b",
        "a": float(prefactor),
        "b": float(slope),
        "r2": float(1 - ss_res / ss_tot),
        "yhat": predicted,
    }


def fit_exponential(L, y):
    """Fit ``y ≈ a · exp(b·L)`` via a straight line in semi-log space.

    Points with ``y <= 0`` are left out of the regression
    ``log y ≈ log a + b·L``. The prediction and R² are evaluated in linear
    space over every supplied point.

    ``L`` and ``y`` are indexed with a boolean mask, so they are expected to
    be NumPy arrays of equal length.

    Returns a dict with keys ``form``, ``a``, ``b``, ``r2`` and ``yhat``.
    """
    positive = y > 0
    rate, log_amplitude = np.polyfit(L[positive], np.log(y[positive]), 1)
    amplitude = np.exp(log_amplitude)
    predicted = amplitude * np.exp(rate * L)

    # Coefficient of determination in the original (non-log) units.
    ss_res = np.sum((y - predicted) ** 2)
    ss_tot = np.sum((y - y.mean()) ** 2)
    return {
        "form": "a*exp(b*L)",
        "a": float(amplitude),
        "b": float(rate),
        "r2": float(1 - ss_res / ss_tot),
        "yhat": predicted,
    }


def fit_linear(L, y):
    """Fit the straight line ``y ≈ a + b·L`` by ordinary least squares.

    Returns a dict with keys ``form``, ``a`` (intercept), ``b`` (slope),
    ``r2`` and ``yhat`` (the fitted line evaluated at every ``L``).
    """
    slope, intercept = np.polyfit(L, y, 1)
    predicted = intercept + slope * L

    # Coefficient of determination: 1 - SS_res / SS_tot.
    ss_res = np.sum((y - predicted) ** 2)
    ss_tot = np.sum((y - y.mean()) ** 2)
    return {
        "form": "a+b*L",
        "a": float(intercept),
        "b": float(slope),
        "r2": float(1 - ss_res / ss_tot),
        "yhat": predicted,
    }


def fit_piecewise_two(L, y):
    """Fit two exponential segments separated at the best split point.

    Every integer threshold ``Ls`` in ``range(3, len(L) - 3)`` is tried. The
    data are partitioned into ``L < Ls`` and ``L >= Ls`` and ``log y`` is fit
    with a straight line in ``L`` on each side. The winner is the candidate
    with the highest R², measured in linear space over all points.

    Note that the thresholds are compared against the *values* of ``L``, so
    the candidate range is only meaningful when ``L`` looks like consecutive
    layer indices.

    A candidate is skipped when either segment
      * contains a non-positive ``y`` (its log is undefined), or
      * has fewer than two points (a line is not determined, and
        ``np.polyfit`` raises on an empty segment).

    Returns a dict with keys ``form``, ``split``, ``a1``, ``b1``, ``a2``,
    ``b2``, ``r2`` and ``yhat``, or ``None`` when no candidate is usable.
    """
    candidates = range(3, len(L) - 3)
    if not candidates:
        return None

    # Total sum of squares does not depend on the split; compute it once.
    ss_tot = ((y - y.mean()) ** 2).sum()

    best = None
    for Ls in candidates:
        mA, mB = L < Ls, L >= Ls
        if np.count_nonzero(mA) < 2 or np.count_nonzero(mB) < 2:
            continue
        if (y[mA] <= 0).any() or (y[mB] <= 0).any():
            continue
        bA, aA = np.polyfit(L[mA], np.log(y[mA]), 1)
        bB, aB = np.polyfit(L[mB], np.log(y[mB]), 1)
        yhat = np.where(mA, np.exp(aA + bA * L), np.exp(aB + bB * L))
        r2 = 1 - ((y - yhat) ** 2).sum() / ss_tot
        if best is None or r2 > best["r2"]:
            best = {"form": f"piecewise-exp-split@L={Ls}", "split": int(Ls),
                    "a1": float(np.exp(aA)), "b1": float(bA),
                    "a2": float(np.exp(aB)), "b2": float(bB),
                    "r2": float(r2), "yhat": yhat}
    return best


def main(path="/tmp/qwen3-4b-null.json"):
    """Print γ-trajectory fits per family and the derived T(L) table.

    Loads ``path`` as JSON and reads its ``"scales"`` mapping, which is
    indexed by family name and converted to a float array per family
    (presumably one scalar per layer — confirm against the producer of the
    file). The layer count is taken from ``scales["input_ln"]``.

    Three sections are printed to stdout:
      1. linear / power / exponential / piecewise-exponential fits per family;
      2. the ``input_ln`` γ values per layer with ratio and log;
      3. the implied temperature ratio ``T(L)/T(0) = γ(0)/γ(L)`` and an
         exponential-cooling (Kirkpatrick-style) fit of ``log T`` against L.

    Args:
        path: JSON file to read. Defaults to the location the script
            originally had hard-coded.
    """
    # Context manager so the file handle is closed deterministically.
    with open(path, encoding="utf-8") as fh:
        d = json.load(fh)
    scales = d["scales"]
    num_layers = len(scales["input_ln"])
    L = np.arange(num_layers, dtype=float)

    families_of_interest = ["input_ln", "post_attn_ln", "q_norm", "k_norm",
                            "q_proj", "k_proj", "v_proj", "o_proj",
                            "gate_proj", "up_proj", "down_proj"]

    print("=" * 72)
    # Layer count comes from the data rather than being hard-coded.
    print(f"γ-trajectory fits per family (Qwen3-4B, {num_layers} layers)")
    print("=" * 72)

    for fam in families_of_interest:
        y = np.array(scales[fam], dtype=float)
        print(f"\n--- {fam} ---")
        print(f"  L=0:  {y[0]:.3f}    L={len(y) - 1}: {y[-1]:.3f}    ratio: {y[-1]/y[0]:+.2f}×")
        fits = [
            fit_linear(L, y),
            fit_power(L + 1, y),           # L+1 so L=0 doesn't explode log
            fit_exponential(L, y),
            # No log of L here, so use the 0-based index: the reported split
            # then matches the layer numbering used in the rest of the report.
            fit_piecewise_two(L, y),
        ]
        for f in fits:
            if f is None:
                # fit_piecewise_two returns None when no split is usable.
                continue
            extras = ""
            if "b" in f:
                extras = f" (a={f['a']:.3g}, b={f['b']:+.4f})"
            elif "split" in f:
                extras = f" (split={f['split']}, b1={f['b1']:+.4f}, b2={f['b2']:+.4f})"
            print(f"  {f['form']:<32} R²={f['r2']:+.4f}{extras}")

    # For input_ln specifically: print the curve as a table and derive T(L).
    y = np.array(scales["input_ln"], dtype=float)
    print("\n" + "=" * 72)
    print("input_ln γ magnitude across layers (the schedule signal)")
    print("=" * 72)
    print(f"  {'L':>3}  {'γ_L':>12}  {'γ_L / γ_0':>10}  {'log γ_L':>10}")
    for l_idx, gamma in enumerate(y):
        print(f"  {l_idx:>3}  {gamma:>12.3f}  {gamma/y[0]:>10.3f}  {log(gamma):>+10.4f}")

    # Classical simulated-annealing schedules, for comparison:
    # - Linear: T(k) = T0 - k * (T0 - Tf)/N
    # - Exponential / Kirkpatrick: T(k) = T0 * α^k
    # - Logarithmic / Hajek: T(k) = c / log(k+2)
    # Larger γ means sharper attention, so γ growing corresponds to T cooling.
    print("\n" + "=" * 72)
    print("Derived attention-temperature T(L) interpretation")
    print("=" * 72)
    print("  Attention logit ∝ (γ * W_Q * W_K * ||residual||²) / √d_head.")
    print("  With γ_L the schedule dial and other factors ~constant across layers,")
    print("  effective attention temperature T(L) ∝ 1/γ(L).")
    print("\n  T(L)/T(0) = γ(0)/γ(L):")
    print(f"  {'L':>3}  {'T(L)/T(0)':>10}  (smaller = cooler = sharper attention)")
    for l_idx, gamma in enumerate(y):
        print(f"  {l_idx:>3}  {y[0]/gamma:>10.4f}")

    # Kirkpatrick: T(L) = T0 · α^L  →  log T(L) = log T0 + L log α,
    # i.e. a straight line in L. Only this law is fitted; the Hajek
    # log-cooling law listed above is not fitted here.
    logT = -np.log(y / y[0])   # because T ∝ 1/γ
    b_kirk, a_kirk = np.polyfit(L, logT, 1)
    print("\n  Kirkpatrick-law fit (exponential cooling):")
    print(f"    log T(L) = {a_kirk:+.3f} + {b_kirk:+.4f} * L  →  T(L) = exp({a_kirk:+.3f}) · exp({b_kirk:+.4f}·L)")
    logT_hat = a_kirk + b_kirk * L
    r2_kirk = 1 - ((logT - logT_hat)**2).sum() / ((logT - logT.mean())**2).sum()
    print(f"    R² (in log space) = {r2_kirk:+.4f}  — ideally ≈ 1 if cooling is pure exponential")


# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()