"""Null test: how similar are adjacent layers in the raw weight-matrix sense?

Runs before any fitting. For each adjacent layer pair (L, L+1) and each
parameter family:

1. Normalize each matrix by its Frobenius norm (unit sphere).
2. Compute cos-sim = <W_L, W_{L+1}> / (||W_L|| ||W_{L+1}||).
3. Compute residual Δ = W_{L+1,norm} - W_{L,norm}; report ||Δ||_F
   (null-if-orthogonal = sqrt(2) ≈ 1.414; null-if-identical = 0).
4. Optionally (``--resid-svd``) report the effective rank of Δ (via entropy
   of the normalized spectrum). Off by default because the residual SVD was
   the bottleneck on large matrices.

Whole network, not just middle block. Prints cos-sim and Frobenius-scale
tables across depth, a PCA of the log-scale track, and saves the results as
JSON.
"""
import argparse
import json

import numpy as np
import torch

try:
    from transformers import AutoModelForCausalLM
except ImportError:
    # Only main() needs transformers; keeping the import soft lets the
    # numpy-only helpers below be imported and tested without it.
    AutoModelForCausalLM = None


_FAMILIES = ["q_proj", "k_proj", "v_proj", "o_proj",
             "gate_proj", "up_proj", "down_proj",
             "input_ln", "post_attn_ln", "q_norm", "k_norm"]


def spec_entropy(singvals, eps=1e-12):
    """Return the Shannon entropy (in nats) of a normalized squared spectrum.

    The squared singular values are divided by their sum (floored at ``eps``)
    and clipped to ``[eps, 1]`` so the logarithm stays finite. An all-zero
    spectrum therefore yields an entropy of approximately 0.
    """
    energy = np.asarray(singvals) ** 2
    probs = np.clip(energy / max(energy.sum(), eps), eps, 1.0)
    return float(-(probs * np.log(probs)).sum())


def frob(x):
    """Return the Frobenius (2-D) or Euclidean (1-D) norm of ``x`` as a float."""
    return float(np.linalg.norm(x))


def norm_mat(x, eps=1e-12):
    """Return ``x`` scaled to unit norm; the divisor is floored at ``eps``."""
    return x / max(frob(x), eps)


def null_test_pair(A_dict, B_dict, family_names, num_heads, num_kv_heads,
                   head_dim, *, compute_spectrum=False):
    """Compare two layers family by family.

    Args:
        A_dict, B_dict: mapping from family name to weight array for the two
            layers being compared.
        family_names: families to evaluate, in order.
        num_heads, num_kv_heads, head_dim: accepted for call compatibility;
            not used by the current statistics.
        compute_spectrum: when True, also take the SVD of the normalized
            residual of every 2-D family and report its spectral entropy and
            effective rank (``exp(entropy)``). This is expensive on large
            matrices. Note that a numerically zero residual reports an
            effective rank of about 1, because of the clipping in
            ``spec_entropy``.

    Returns:
        Dict keyed by family with ``cos``, ``resid_norm``, ``resid_eff_rank``
        and ``resid_spec_entropy``. Families that are missing from either
        layer, differ in shape, or have a (near-)zero norm are omitted. The
        last two fields are None unless the spectrum was computed.
    """
    out = {}
    for fam in family_names:
        Wa = A_dict.get(fam)
        Wb = B_dict.get(fam)
        if Wa is None or Wb is None or Wa.shape != Wb.shape:
            continue
        fa, fb = frob(Wa), frob(Wb)
        if fa < 1e-12 or fb < 1e-12:
            continue
        # Clip: rounding can push a cosine of (anti)parallel matrices
        # marginally outside [-1, 1].
        cos = float(np.clip((Wa * Wb).sum() / (fa * fb), -1.0, 1.0))
        # || Wa/||Wa|| - Wb/||Wb|| ||^2 = 2 - 2*cos for unit-norm operands.
        resid_norm = float(np.sqrt(max(2.0 - 2.0 * cos, 0.0)))

        eff_rank = None
        se = None
        if compute_spectrum and Wa.ndim == 2:
            delta = Wb / fb - Wa / fa
            se = spec_entropy(np.linalg.svd(delta, compute_uv=False))
            eff_rank = float(np.exp(se))

        out[fam] = {
            "cos": cos,
            "resid_norm": resid_norm,
            "resid_eff_rank": eff_rank,
            "resid_spec_entropy": se,
        }
    return out


def _to_numpy(param):
    """Copy a torch parameter into a float32 numpy array."""
    return param.detach().float().numpy()


def _collect_layer_weights(model, num_layers):
    """Extract the per-family weight arrays of every decoder layer."""
    layers = {}
    for idx in range(num_layers):
        layer = model.model.layers[idx]
        attn = layer.self_attn
        mlp = layer.mlp
        entry = {name: _to_numpy(getattr(attn, name).weight)
                 for name in ("q_proj", "k_proj", "v_proj", "o_proj")}
        entry.update((name, _to_numpy(getattr(mlp, name).weight))
                     for name in ("gate_proj", "up_proj", "down_proj"))
        entry["input_ln"] = _to_numpy(layer.input_layernorm.weight)
        entry["post_attn_ln"] = _to_numpy(layer.post_attention_layernorm.weight)
        # q_norm / k_norm only exist on some architectures.
        for name in ("q_norm", "k_norm"):
            module = getattr(attn, name, None)
            if module is not None:
                entry[name] = _to_numpy(module.weight)
        layers[idx] = entry
    return layers


def _scale_track(layers, families, num_layers):
    """Return per-family lists of Frobenius norms (None where absent)."""
    return {
        fam: [frob(layers[idx][fam]) if fam in layers[idx] else None
              for idx in range(num_layers)]
        for fam in families
    }


def _family_values(pair_results, fam, key):
    """Collect the non-None values of ``key`` for ``fam`` across all pairs."""
    values = (pr["families"].get(fam, {}).get(key) for pr in pair_results)
    return [v for v in values if v is not None]


def _print_cos_table(pair_results, families):
    """Print the adjacent-layer cosine similarity, one row per layer pair."""
    print("\n=== Adjacent-layer raw cos-sim per family ===")
    print(" null interpretation: 1.0 = identical matrices up to scale, 0 = orthogonal")
    # Use one column list for header and rows so they always line up.
    shown = [fam for fam in families
             if any(fam in pr["families"] for pr in pair_results)]
    print(f"\n {'L':>3}", end="")
    for fam in shown:
        print(f" {fam:>12}", end="")
    print()
    for pr in pair_results:
        print(f" {pr['L']:>3}", end="")
        for fam in shown:
            if fam in pr["families"]:
                print(f" {pr['families'][fam]['cos']:>+12.4f}", end="")
            else:
                print(f" {'':>12}", end="")
        print()


def _print_family_summary(pair_results, families):
    """Print per-family aggregates and the scalar-fit comparison.

    raw_resid = sqrt(2 - 2*cos); scalar_fit = sqrt(1 - cos²) = sin(angle);
    random baseline = sqrt(2) ≈ 1.414.
    """
    print("\n=== Per-family summary (across all adjacent pairs) ===")
    print(" random baseline = sqrt(2) ≈ 1.414 (what we'd see with no relationship)")
    print(f"\n {'family':>14} {'mean_cos':>10} {'median_cos':>11} "
          f"{'raw_resid':>10} {'scalar_fit':>11} {'improve_frac':>13} {'mean_SE':>8}")
    for fam in families:
        cs = np.array(_family_values(pair_results, fam, "cos"))
        if cs.size == 0:
            continue
        ses = _family_values(pair_results, fam, "resid_spec_entropy")
        raw = np.sqrt(np.maximum(2.0 - 2.0 * cs, 0.0)).mean()
        scalar_fit = np.sqrt(np.maximum(1.0 - cs ** 2, 0.0)).mean()
        # Improvement fraction: (raw - scalar_fit) / raw, where
        # 0 = scalar does nothing, 1 = scalar reconstructs.
        improve_frac = (raw - scalar_fit) / max(raw, 1e-12)
        # "n/a" rather than 0: a zero entropy would read as a rank-1 residual.
        mean_se = f"{np.mean(ses):>8.4f}" if ses else f"{'n/a':>8}"
        print(f" {fam:>14} {cs.mean():>+10.4f} {np.median(cs):>+11.4f} "
              f"{raw:>10.4f} {scalar_fit:>11.4f} {improve_frac:>13.4f} "
              f"{mean_se}")


def _print_scale_track(scales, families, num_layers):
    """Print the Frobenius norm of every family at every layer."""
    print("\n=== Scale track: ||W_family||_F across layers ===")
    shown = [fam for fam in families
             if any(s is not None for s in scales[fam])]
    print(f" {'L':>3}", end="")
    for fam in shown:
        print(f" {fam:>12}", end="")
    print()
    for idx in range(num_layers):
        print(f" {idx:>3}", end="")
        for fam in shown:
            value = scales[fam][idx]
            if value is not None:
                print(f" {value:>12.4f}", end="")
            else:
                print(f" {'':>12}", end="")
        print()


def _scale_track_pca(scales, families):
    """SVD of the per-family-centered log-scale matrix (families x layers).

    Only families with a strictly positive norm at every layer are used, so
    the logarithm is finite. Returns ``(fam_used, U, S, Vh)``; all arrays are
    empty when no family qualifies.
    """
    fam_used = [fam for fam in families
                if scales[fam]
                and all(v is not None and v > 0 for v in scales[fam])]
    if not fam_used:
        empty = np.zeros((0, 0))
        return fam_used, empty, np.zeros(0), empty
    log_scales = np.log(np.array([scales[fam] for fam in fam_used]))
    centered = log_scales - log_scales.mean(axis=1, keepdims=True)
    U, S, Vh = np.linalg.svd(centered, full_matrices=False)
    return fam_used, U, S, Vh


def _print_scale_pca(fam_used, U, S):
    """Print explained variance and family loadings of each PCA mode."""
    print("\n=== PCA of log-scale-track (dimensionality of schedule) ===")
    if not fam_used:
        print(" no family has a positive scale at every layer; PCA skipped")
        return
    total = (S ** 2).sum()
    print(" explained variance by mode:")
    for i, s in enumerate(S):
        pct = float(s ** 2 / max(total, 1e-20)) * 100
        loadings = ", ".join(f"{fam}={U[j, i]:+.2f}"
                             for j, fam in enumerate(fam_used))
        print(f" mode {i+1:>2}: {pct:>6.2f}% "
              f"(loadings per family: {loadings})")


@torch.no_grad()
def main():
    """Load the model, run the adjacent-layer null test, print and save it."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="Qwen/Qwen3-4B")
    ap.add_argument("--out", default="/tmp/sa-null-residual.json")
    ap.add_argument("--resid-svd", action="store_true",
                    help="also compute spectral entropy / effective rank of "
                         "each residual (slow on large matrices)")
    args = ap.parse_args()

    if AutoModelForCausalLM is None:
        raise ImportError("the 'transformers' package is required to load "
                          "the model")

    print(f"Loading {args.model} ...", flush=True)
    # NOTE: trust_remote_code=True executes code shipped with the model
    # repository; only point --model at sources you trust.
    model = AutoModelForCausalLM.from_pretrained(
        args.model,
        torch_dtype=torch.bfloat16,  # halve memory vs fp32
        device_map="cpu",
        trust_remote_code=True,
        attn_implementation="eager",
    )
    cfg = model.config
    num_layers = cfg.num_hidden_layers
    num_heads = cfg.num_attention_heads
    num_kv_heads = getattr(cfg, "num_key_value_heads", num_heads)
    hidden = cfg.hidden_size
    head_dim = getattr(cfg, "head_dim", hidden // num_heads)
    intermediate = cfg.intermediate_size
    print(f" L={num_layers} H={num_heads} kv={num_kv_heads} hd={head_dim} "
          f"hidden={hidden} ff={intermediate}", flush=True)

    families = list(_FAMILIES)
    layers = _collect_layer_weights(model, num_layers)
    del model  # the fp32 numpy copies are all that is needed from here on

    # Per-layer scale (Frobenius norm) for the scale-track PCA.
    scales = _scale_track(layers, families, num_layers)

    # Pairwise null test.
    pair_results = []
    for idx in range(num_layers - 1):
        stats = null_test_pair(layers[idx], layers[idx + 1], families,
                               num_heads, num_kv_heads, head_dim,
                               compute_spectrum=args.resid_svd)
        pair_results.append({"L": idx, "L_next": idx + 1, "families": stats})

    _print_cos_table(pair_results, families)
    _print_family_summary(pair_results, families)
    _print_scale_track(scales, families, num_layers)

    fam_used, U, S, Vh = _scale_track_pca(scales, families)
    _print_scale_pca(fam_used, U, S)

    with open(args.out, "w") as f:
        json.dump({
            "model": args.model,
            "pair_results": pair_results,
            "scales": scales,
            "scale_pca_singvals": S.tolist(),
            "scale_pca_loadings": U.tolist(),
            # Same values as diag(S) @ Vh, without building the diagonal.
            "scale_pca_scores": (S[:, None] * Vh).tolist(),
            "fam_used": fam_used,
        }, f, indent=2)
    print(f"\nSaved: {args.out}")


if __name__ == "__main__":
    main()