# Source: forked from kent/consciousness (web listing: 157 lines, 5.3 KiB, Python).
|
|
"""Analyze aligned_variation output to answer the training-artifact vs
|
||
|
|
specialization question.

Inputs: qwen3-*-null.json (raw cos-sim) + qwen3-*-aligned.json (aligned cos-sim)

For each layer pair where aligned data exists, compare:
  raw_cos(L)     — before Procrustes alignment
  aligned_cos(L) — after Procrustes alignment
  delta = aligned_cos - raw_cos

If delta is substantial (aligned much larger than raw), the rotation gauge
was hiding shared structure → training-artifact hypothesis supported.
If delta ≈ 0, specialization is real (rotation can't find shared
structure because there isn't any).

Stratify by phase to test the prediction that LATE layers have LARGER delta
(more rotation-gauge noise, less real specialization).
|
||
|
|
"""
|
||
|
|
import argparse
|
||
|
|
import json
|
||
|
|
import numpy as np
|
||
|
|
|
||
|
|
|
||
|
|
def phase_of(L, num_layers):
    """Map layer index ``L`` to a coarse phase label.

    The 64-layer (Qwen3-32B) and 36-layer (Qwen3-4B) models use fixed
    layer-index cutoffs; per the original note these come from measured
    32B entropy boundaries. Any other depth falls back to cutoffs on the
    relative depth ``L / num_layers`` and would ideally be refit.

    Returns one of "A", "B", "C", "D", "E" or "tail". The 36-layer table
    has no "E" phase.
    """
    if num_layers == 64:  # Qwen3-32B
        # (last layer index belonging to the phase, phase label), ascending.
        cutoffs = ((6, "A"), (9, "B"), (31, "C"), (46, "D"), (58, "E"))
        return next((name for last, name in cutoffs if L <= last), "tail")

    if num_layers == 36:  # Qwen3-4B
        cutoffs = ((6, "A"), (9, "B"), (23, "C"), (33, "D"))
        return next((name for last, name in cutoffs if L <= last), "tail")

    # Unknown depth: compare the relative depth against exclusive upper
    # bounds (strict "<", unlike the inclusive tables above).
    depth = L / num_layers
    cutoffs = ((0.11, "A"), (0.15, "B"), (0.5, "C"), (0.75, "D"), (0.92, "E"))
    return next((name for limit, name in cutoffs if depth < limit), "tail")
|
||
|
|
|
||
|
|
|
||
|
|
# Weight-matrix families reported in every table, in column order.
_FAMILIES = ("q_proj", "k_proj", "v_proj", "o_proj",
             "gate_proj", "up_proj", "down_proj")


def _load_json(path):
    """Parse the JSON file at ``path``, closing the file handle afterwards."""
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def _fmt_cos(value):
    """Render a cos-sim as a signed 10-wide cell; missing values become N/A."""
    if value is None:
        # Right-align to the same width as the numeric cells so that rows
        # with missing data stay lined up with the header.
        return f"{'N/A':>10}"
    return f"{value:+10.4f}"


def main():
    """Compare raw and Procrustes-aligned cos-sims and print the report.

    Command-line arguments:
        null_json:    output of null_residual.py; read for
                      ``pair_results``, a list of entries with an ``L``
                      value and a ``families`` mapping whose values carry
                      a ``cos`` field.
        aligned_json: output of aligned_variation.py; read for
                      ``num_layers`` and ``aligned_cos``, a mapping
                      family -> {layer key: cos}. Layer keys are looked
                      up as strings.

    Prints three sections to stdout: a per-pair table of raw and aligned
    cos-sim for every family, the per-phase mean of (aligned - raw) for
    every family, and a fixed interpretation guide.

    The set of layer pairs reported is taken from the ``q_proj`` entry of
    ``aligned_cos``; a KeyError is raised if that entry is absent. Other
    families missing from either file are shown as N/A and left out of
    the per-phase means.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("null_json", help="output of null_residual.py")
    ap.add_argument("aligned_json", help="output of aligned_variation.py")
    args = ap.parse_args()

    null = _load_json(args.null_json)
    aligned = _load_json(args.aligned_json)

    num_layers = aligned["num_layers"]
    aligned_cos = aligned["aligned_cos"]  # dict: family -> {L: cos}
    pair_results = null["pair_results"]   # list of {L, L_next, families: {family: {cos, ...}}}

    # Build raw_cos dict from null output: family -> {L: cos}
    raw_cos = {fam: {} for fam in _FAMILIES}
    for pr in pair_results:
        L = pr["L"]
        for fam in _FAMILIES:
            if fam in pr["families"]:
                raw_cos[fam][L] = pr["families"][fam]["cos"]

    print(f"=== Aligned vs Raw cos-sim comparison ({args.aligned_json}) ===")
    print(f" {num_layers} layers total; aligned data for "
          f"{len(aligned_cos['q_proj'])} pairs\n")

    # Per-pair table: L, phase, family cos-sims raw and aligned
    print(f" {'L':>3} {'phase':>5}", end="")
    for fam in _FAMILIES:
        print(f" {fam+'_raw':>10} {fam+'_ali':>10}", end="")
    print()

    # aligned_cos keys are strings; sort them numerically for row order.
    L_keys = sorted(int(key) for key in aligned_cos["q_proj"])

    # phase -> family -> list of (aligned - raw); filled while the rows are
    # printed so each (L, family) pair is looked up only once.
    phase_deltas = {}
    for L in L_keys:
        Lstr = str(L)
        phase = phase_of(L, num_layers)
        row = f" {L:>3} {phase:>5}"
        for fam in _FAMILIES:
            r = raw_cos[fam].get(L)
            # Tolerate a family missing from the aligned file, matching how
            # missing raw values are handled.
            a = aligned_cos.get(fam, {}).get(Lstr)
            row += f" {_fmt_cos(r)} {_fmt_cos(a)}"
            if r is not None and a is not None:
                phase_deltas.setdefault(phase, {}).setdefault(fam, []).append(a - r)
        print(row)

    # Aggregate by phase: mean (aligned - raw) per family per phase
    print("\n=== Per-phase mean delta (aligned_cos - raw_cos) by family ===")
    print(" Large positive delta = rotation alignment revealed shared")
    print(" structure. Small delta = specialization is gauge-independent.\n")

    print(f" {'phase':>6}", end="")
    for fam in _FAMILIES:
        print(f" {fam:>10}", end="")
    print()
    for ph in sorted(phase_deltas):
        print(f" {ph:>6}", end="")
        for fam in _FAMILIES:
            vals = phase_deltas[ph].get(fam, [])
            if vals:
                print(f" {np.mean(vals):+10.4f}", end="")
            else:
                print(f" {'—':>10}", end="")
        print()

    # Interpretation
    print("\n=== Interpretation ===")
    print(" Prediction under training-artifact hypothesis:")
    print(" delta(Phase E) > delta(Phase C) for projection families")
    print(" → late layers have more rotation-gauge-hidden structure")
    print(" → specialization is partly training noise, not structural")
    print("")
    print(" Prediction under real-specialization hypothesis:")
    print(" delta ~ 0 everywhere")
    print(" → layers genuinely point in different directions, gauge irrelevant")
|
||
|
|
|
||
|
|
|
||
|
|
# Run the analysis only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|