"""Analyze aligned_variation output to answer the training-artifact vs specialization question. Inputs: qwen3-*-null.json (raw cos-sim) + qwen3-*-aligned.json (aligned cos-sim) For each layer pair where aligned data exists, compare: raw_cos(L) — before Procrustes alignment aligned_cos(L) — after Procrustes alignment delta = aligned_cos - raw_cos If delta is substantial (aligned much larger than raw), rotation gauge was hiding shared structure → training-artifact hypothesis supported. If delta ≈ 0, specialization is real (rotation can't find shared structure because there isn't any). Stratify by phase to test prediction that LATE layers have LARGER delta (more rotation-gauge noise, less real specialization). """ import argparse import json import numpy as np def phase_of(L, num_layers): """Rough phase assignment based on measured 32B entropy boundaries. For other models we'd refit — but shape should be similar.""" if num_layers == 64: # Qwen3-32B if L <= 6: return "A" elif L <= 9: return "B" elif L <= 31: return "C" elif L <= 46: return "D" elif L <= 58: return "E" else: return "tail" elif num_layers == 36: # Qwen3-4B if L <= 6: return "A" elif L <= 9: return "B" elif L <= 23: return "C" elif L <= 33: return "D" else: return "tail" else: frac = L / num_layers if frac < 0.11: return "A" elif frac < 0.15: return "B" elif frac < 0.5: return "C" elif frac < 0.75: return "D" elif frac < 0.92: return "E" else: return "tail" def main(): ap = argparse.ArgumentParser() ap.add_argument("null_json", help="output of null_residual.py") ap.add_argument("aligned_json", help="output of aligned_variation.py") args = ap.parse_args() null = json.load(open(args.null_json)) aligned = json.load(open(args.aligned_json)) num_layers = aligned["num_layers"] aligned_cos = aligned["aligned_cos"] # dict: family -> {L: cos} pair_results = null["pair_results"] # list of {L, L_next, families: {family: {cos, ...}}} # Build raw_cos dict from null output raw_cos = {fam: {} for fam in ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]} for pr in pair_results: L = pr["L"] for fam in raw_cos: if fam in pr["families"]: raw_cos[fam][L] = pr["families"][fam]["cos"] print(f"=== Aligned vs Raw cos-sim comparison ({args.aligned_json}) ===") print(f" {num_layers} layers total; aligned data for " f"{len(aligned_cos['q_proj'])} pairs\n") # Per-pair table: L, phase, family cos-sims raw and aligned families = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] print(f" {'L':>3} {'phase':>5}", end="") for fam in families: print(f" {fam+'_raw':>10} {fam+'_ali':>10}", end="") print() L_keys = sorted([int(L) for L in aligned_cos["q_proj"].keys()]) for L in L_keys: Lstr = str(L) phase = phase_of(L, num_layers) row = f" {L:>3} {phase:>5}" for fam in families: r = raw_cos[fam].get(L, None) a = aligned_cos[fam].get(Lstr, None) rstr = f"{r:+10.4f}" if r is not None else " N/A" astr = f"{a:+10.4f}" if a is not None else " N/A" row += f" {rstr} {astr}" print(row) # Aggregate by phase: mean (aligned - raw) per family per phase print("\n=== Per-phase mean delta (aligned_cos - raw_cos) by family ===") print(f" Large positive delta = rotation alignment revealed shared") print(f" structure. Small delta = specialization is gauge-independent.\n") phase_deltas = {} for L in L_keys: Lstr = str(L) ph = phase_of(L, num_layers) for fam in families: r = raw_cos[fam].get(L, None) a = aligned_cos[fam].get(Lstr, None) if r is not None and a is not None: phase_deltas.setdefault(ph, {}).setdefault(fam, []).append(a - r) print(f" {'phase':>6}", end="") for fam in families: print(f" {fam:>10}", end="") print() for ph in sorted(phase_deltas.keys()): print(f" {ph:>6}", end="") for fam in families: vals = phase_deltas[ph].get(fam, []) if vals: print(f" {np.mean(vals):+10.4f}", end="") else: print(f" {'—':>10}", end="") print() # Interpretation print("\n=== Interpretation ===") print(" Prediction under training-artifact hypothesis:") print(" delta(Phase E) > delta(Phase C) for projection families") print(" → late layers have more rotation-gauge-hidden structure") print(" → specialization is partly training noise, not structural") print("") print(" Prediction under real-specialization hypothesis:") print(" delta ~ 0 everywhere") print(" → layers genuinely point in different directions, gauge irrelevant") if __name__ == "__main__": main()