"""Fit a functional form to the LN γ trajectory across layers; derive the
effective attention temperature T(L) from known coupling formulas.

Rules of what scales with depth (from literature):

  DeepNorm:  α_dec = (2M)^(1/4), β_dec = (8M)^(-1/4). Same per layer — does
             NOT depend on layer index l. The free variation across layers
             has to live in LN γ.
  Depth-μP:  block multiplier a/√L, LR η/√L. Same per layer.

So γ(L) is the family carrying the per-layer schedule. Try fitting forms:

  γ(L) = a · L^b        (power law in layer index)
  γ(L) = a · exp(b·L)   (exponential)
  γ(L) = a + b·L        (linear)
  γ(L) = a + b·L^c      (free c) (power law with free exponent)

Report fit quality (R², residual statistics), and for the best fit, compute
the derived T(L) curve.
"""

import json
from math import log, exp

import numpy as np

# Default location of the per-family scale dump read by main().
DEFAULT_SCALES_PATH = "/tmp/qwen3-4b-null.json"


def _r_squared(y, yhat):
    """Return the coefficient of determination of ``yhat`` against ``y``.

    Returns NaN when ``y`` is constant (zero total variance), where R² is
    undefined, instead of dividing by zero.
    """
    ss_res = float(((y - yhat) ** 2).sum())
    ss_tot = float(((y - y.mean()) ** 2).sum())
    if ss_tot == 0.0:
        return float("nan")
    return 1.0 - ss_res / ss_tot


def fit_power(L, y):
    """y ≈ a · L^b  →  log y ≈ log a + b log L.

    The regression runs in log-log space on the samples where both ``L`` and
    ``y`` are strictly positive (at least two such samples are needed); the
    prediction and R² are then evaluated in linear space on all samples.

    Returns a dict with keys ``form``, ``a``, ``b``, ``r2`` and ``yhat``.
    """
    L = np.asarray(L, dtype=float)
    y = np.asarray(y, dtype=float)
    mask = (L > 0) & (y > 0)
    # np.polyfit returns the highest-degree coefficient first: (slope, intercept).
    b, loga = np.polyfit(np.log(L[mask]), np.log(y[mask]), 1)
    yhat = np.exp(loga) * (L ** b)
    return {"form": "a*L^b", "a": float(np.exp(loga)), "b": float(b),
            "r2": _r_squared(y, yhat), "yhat": yhat}


def fit_exponential(L, y):
    """y ≈ a · exp(b·L)  →  log y ≈ log a + b·L.

    The regression runs in semi-log space on the samples where ``y`` is
    strictly positive; the prediction and R² are evaluated in linear space on
    all samples.

    Returns a dict with keys ``form``, ``a``, ``b``, ``r2`` and ``yhat``.
    """
    L = np.asarray(L, dtype=float)
    y = np.asarray(y, dtype=float)
    mask = y > 0
    b, loga = np.polyfit(L[mask], np.log(y[mask]), 1)
    yhat = np.exp(loga) * np.exp(b * L)
    return {"form": "a*exp(b*L)", "a": float(np.exp(loga)), "b": float(b),
            "r2": _r_squared(y, yhat), "yhat": yhat}


def fit_linear(L, y):
    """y ≈ a + b·L, ordinary least squares.

    Returns a dict with keys ``form``, ``a`` (intercept), ``b`` (slope),
    ``r2`` and ``yhat``.
    """
    L = np.asarray(L, dtype=float)
    y = np.asarray(y, dtype=float)
    b, a = np.polyfit(L, y, 1)
    yhat = a + b * L
    return {"form": "a+b*L", "a": float(a), "b": float(b),
            "r2": _r_squared(y, yhat), "yhat": yhat}


def fit_piecewise_two(L, y):
    """Best split point L* and linear fits on each half (log-space).

    Every integer candidate ``Ls`` in ``range(3, len(L) - 3)`` partitions the
    samples into ``L < Ls`` and ``L >= Ls``; each part gets its own
    exponential fit (a line in log space) and the candidate with the highest
    linear-space R² wins.

    Returns a dict with keys ``form``, ``split``, ``a1``, ``b1``, ``a2``,
    ``b2``, ``r2`` and ``yhat``, or ``None`` when no candidate split is
    usable (too few samples, or non-positive values in every partition).
    """
    L = np.asarray(L, dtype=float)
    y = np.asarray(y, dtype=float)
    best = None
    for Ls in range(3, len(L) - 3):
        mA, mB = L < Ls, L >= Ls
        # A line needs at least two points per segment to be determined.
        if mA.sum() < 2 or mB.sum() < 2:
            continue
        # The log-space fit is undefined for non-positive values.
        if (y[mA] <= 0).any() or (y[mB] <= 0).any():
            continue
        bA, aA = np.polyfit(L[mA], np.log(y[mA]), 1)
        bB, aB = np.polyfit(L[mB], np.log(y[mB]), 1)
        yhat = np.where(mA, np.exp(aA + bA * L), np.exp(aB + bB * L))
        r2 = _r_squared(y, yhat)
        if best is None or r2 > best["r2"]:
            best = {"form": f"piecewise-exp-split@L={Ls}", "split": int(Ls),
                    "a1": float(np.exp(aA)), "b1": float(bA),
                    "a2": float(np.exp(aB)), "b2": float(bB),
                    "r2": r2, "yhat": yhat}
    return best


def main(path=DEFAULT_SCALES_PATH):
    """Print the γ-trajectory fit report for the scale dump at ``path``.

    The JSON file must contain a ``"scales"`` mapping from family name to a
    per-layer list of numbers. For every family of interest the linear,
    power-law, exponential and piecewise-exponential fits are printed; for
    ``input_ln`` the per-layer γ table, the derived relative temperature
    T(L)/T(0) = γ(0)/γ(L) and a Kirkpatrick (exponential cooling) fit follow.
    """
    with open(path, encoding="utf-8") as fh:
        d = json.load(fh)
    scales = d["scales"]
    num_layers = len(scales["input_ln"])
    L = np.arange(num_layers, dtype=float)

    families_of_interest = ["input_ln", "post_attn_ln", "q_norm", "k_norm",
                            "q_proj", "k_proj", "v_proj", "o_proj",
                            "gate_proj", "up_proj", "down_proj"]

    print("=" * 72)
    print(f"γ-trajectory fits per family (Qwen3-4B, {num_layers} layers)")
    print("=" * 72)

    for fam in families_of_interest:
        y = np.array(scales[fam], dtype=float)
        print(f"\n--- {fam} ---")
        print(f" L=0: {y[0]:.3f} L={len(y) - 1}: {y[-1]:.3f} ratio: {y[-1]/y[0]:+.2f}×")
        fits = [
            fit_linear(L, y),
            fit_power(L + 1, y),  # L+1 so L=0 doesn't explode log
            fit_exponential(L, y),
            # No log of L is taken here, so use the raw 0-based index: the
            # reported split is then a real layer index, consistent with the
            # tables printed below.
            fit_piecewise_two(L, y),
        ]
        for f in fits:
            if f is None:
                continue
            extras = ""
            if "b" in f:
                extras = f" (a={f['a']:.3g}, b={f['b']:+.4f})"
            elif "split" in f:
                extras = f" (split={f['split']}, b1={f['b1']:+.4f}, b2={f['b2']:+.4f})"
            print(f" {f['form']:<32} R²={f['r2']:+.4f}{extras}")

    # For input_ln specifically: plot the curve (text) and derive T(L)
    y = np.array(scales["input_ln"], dtype=float)
    print("\n" + "=" * 72)
    print("input_ln γ magnitude across layers (the schedule signal)")
    print("=" * 72)
    print(f" {'L':>3} {'γ_L':>12} {'γ_L / γ_0':>10} {'log γ_L':>10}")
    for l_idx, gamma in enumerate(y):
        # math.log raises on non-positive input; show NaN for such a cell
        # rather than aborting the whole report.
        log_gamma = log(gamma) if gamma > 0 else float("nan")
        print(f" {l_idx:>3} {gamma:>12.3f} {gamma/y[0]:>10.3f} {log_gamma:>+10.4f}")

    # Classical SA schedules for comparison
    # - Linear: T(k) = T0 - k * (T0 - Tf)/N
    # - Exponential / Kirkpatrick: T(k) = T0 * α^k
    # - Logarithmic / Hajek: T(k) = c / log(k+2)
    # For γ (which grows = temperature drops, since larger γ → sharper attention):
    # γ growing corresponds to T cooling
    print("\n" + "=" * 72)
    print("Derived attention-temperature T(L) interpretation")
    print("=" * 72)
    print(" Attention logit ∝ (γ * W_Q * W_K * ||residual||²) / √d_head.")
    print(" With γ_L the schedule dial and other factors ~constant across layers,")
    print(" effective attention temperature T(L) ∝ 1/γ(L).")
    print("\n T(L)/T(0) = γ(0)/γ(L):")
    print(f" {'L':>3} {'T(L)/T(0)':>10} (smaller = cooler = sharper attention)")
    for l_idx, gamma in enumerate(y):
        print(f" {l_idx:>3} {y[0]/gamma:>10.4f}")

    # Comparison with classical SA cooling laws:
    # Kirkpatrick: T(L) = T0 · α^L → log T(L) = log T0 + L log α
    logT = -np.log(y / y[0])  # because T ∝ 1/γ
    b_kirk, a_kirk = np.polyfit(L, logT, 1)
    # Hajek (log-cooling): T(L) = c/log(L+2)
    # Predicts: log T = log c - log(log(L+2))
    # Fit T(L) to c / log(L+c2)
    print("\n Kirkpatrick-law fit (exponential cooling):")
    print(f" log T(L) = {a_kirk:+.3f} + {b_kirk:+.4f} * L → T(L) = exp({a_kirk:+.3f}) · exp({b_kirk:+.4f}·L)")
    logT_hat = a_kirk + b_kirk * L
    r2_kirk = _r_squared(logT, logT_hat)
    print(f" R² (in log space) = {r2_kirk:+.4f} — ideally ≈ 1 if cooling is pure exponential")


if __name__ == "__main__":
    main()