# consciousness/sa-schedule-analyze-aligned.py

"""Analyze aligned_variation output to answer the training-artifact vs
specialization question.

Inputs: qwen3-*-null.json (raw cos-sim) + qwen3-*-aligned.json (aligned cos-sim)

For each layer pair where aligned data exists, compare:
  raw_cos(L)       — before Procrustes alignment
  aligned_cos(L)   — after Procrustes alignment
  delta = aligned_cos - raw_cos

If delta is substantial (aligned much larger than raw), rotation gauge
was hiding shared structure → training-artifact hypothesis supported.
If delta ≈ 0, specialization is real (rotation can't find shared
structure because there isn't any).

Stratify by phase to test prediction that LATE layers have LARGER delta
(more rotation-gauge noise, less real specialization).
"""
import argparse
import json
import numpy as np


def phase_of(L, num_layers):
    """Map layer index ``L`` to a coarse phase label.

    Two depths have hand-written boundary tables (64 layers, noted in the
    original as Qwen3-32B, and 36 layers, noted as Qwen3-4B). The
    boundaries are described as rough, derived from measured 32B entropy
    boundaries. Any other depth falls back to thresholds on the
    fractional depth ``L / num_layers``.

    Returns one of "A", "B", "C", "D", "E" or "tail". The 36-layer table
    defines no "E" phase: everything past "D" is "tail".
    """
    # Each table lists (upper bound, label) in ascending order; the first
    # bound that admits the layer wins, anything beyond the last is "tail".
    if num_layers == 64:  # Qwen3-32B
        inclusive_bounds = ((6, "A"), (9, "B"), (31, "C"), (46, "D"), (58, "E"))
    elif num_layers == 36:  # Qwen3-4B
        inclusive_bounds = ((6, "A"), (9, "B"), (23, "C"), (33, "D"))
    else:
        # Unknown depth: compare relative depth against exclusive bounds.
        depth = L / num_layers
        exclusive_bounds = (
            (0.11, "A"),
            (0.15, "B"),
            (0.5, "C"),
            (0.75, "D"),
            (0.92, "E"),
        )
        for limit, label in exclusive_bounds:
            if depth < limit:
                return label
        return "tail"

    for last_layer, label in inclusive_bounds:
        if L <= last_layer:
            return label
    return "tail"

def _load_json(path):
    """Read and parse one JSON file, closing the handle afterwards."""
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def _fmt_cos(value):
    """Format a cos-sim cell as a signed 10-wide float, or N/A when missing."""
    return f"{value:+10.4f}" if value is not None else "       N/A"


def main(argv=None):
    """Print a raw-vs-aligned cos-sim comparison and per-phase mean deltas.

    Reads two JSON files named on the command line:

    * ``null_json``: must contain ``pair_results``, a list of entries each
      holding ``L`` and ``families`` (family -> mapping with a ``cos`` value).
    * ``aligned_json``: must contain ``num_layers`` and ``aligned_cos``
      (family -> mapping from layer index, as a string key, to cos).

    It prints a per-layer table of raw and aligned cos-sim for each
    projection family, then the mean of ``aligned - raw`` grouped by the
    phase that ``phase_of`` assigns to each layer, then a fixed
    interpretation guide.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, so calling
            ``main()`` behaves as before.

    Raises:
        KeyError: if a required top-level key is missing from either file.
        OSError / json.JSONDecodeError: if a file cannot be read or parsed.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("null_json", help="output of null_residual.py")
    ap.add_argument("aligned_json", help="output of aligned_variation.py")
    args = ap.parse_args(argv)

    null = _load_json(args.null_json)
    aligned = _load_json(args.aligned_json)

    # Single source of truth for the family list (column order of all tables).
    families = ("q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj")

    num_layers = aligned["num_layers"]
    aligned_cos = aligned["aligned_cos"]     # dict: family -> {L: cos}
    pair_results = null["pair_results"]      # list of {L, families: {family: {cos, ...}}}

    # A family absent from the aligned file is treated as "no data" rather
    # than aborting the whole report with a KeyError.
    aligned_by_family = {fam: aligned_cos.get(fam, {}) for fam in families}

    # Build raw_cos dict from null output, keyed by integer layer index so
    # the lookups below match even if the producer wrote L as a string.
    raw_cos = {fam: {} for fam in families}
    for pr in pair_results:
        layer = int(pr["L"])
        for fam in families:
            if fam in pr["families"]:
                raw_cos[fam][layer] = pr["families"][fam]["cos"]

    # Layers to report: every layer that has aligned data in ANY family
    # (identical to the q_proj key set when all families cover the same pairs).
    layer_ids = sorted({int(key)
                        for per_layer in aligned_by_family.values()
                        for key in per_layer})

    print(f"=== Aligned vs Raw cos-sim comparison ({args.aligned_json}) ===")
    print(f"  {num_layers} layers total; aligned data for "
          f"{len(layer_ids)} pairs\n")

    # Per-pair table: L, phase, family cos-sims raw and aligned
    header = f"  {'L':>3}  {'phase':>5}" + "".join(
        f"  {fam + '_raw':>10}  {fam + '_ali':>10}" for fam in families)
    print(header)

    # Deltas are accumulated while the table is printed so each (raw,
    # aligned) pair is looked up only once.
    phase_deltas = {}
    for layer in layer_ids:
        key = str(layer)  # JSON object keys are strings
        phase = phase_of(layer, num_layers)
        row = f"  {layer:>3}  {phase:>5}"
        for fam in families:
            r = raw_cos[fam].get(layer)
            a = aligned_by_family[fam].get(key)
            row += f"  {_fmt_cos(r)}  {_fmt_cos(a)}"
            if r is not None and a is not None:
                phase_deltas.setdefault(phase, {}).setdefault(fam, []).append(a - r)
        print(row)

    # Aggregate by phase: mean (aligned - raw) per family per phase
    print("\n=== Per-phase mean delta (aligned_cos - raw_cos) by family ===")
    print("  Large positive delta = rotation alignment revealed shared")
    print("  structure. Small delta = specialization is gauge-independent.\n")

    print(f"  {'phase':>6}" + "".join(f"  {fam:>10}" for fam in families))
    # Plain string sort puts "A".."E" before "tail" (uppercase sorts first).
    for ph in sorted(phase_deltas):
        line = f"  {ph:>6}"
        for fam in families:
            vals = phase_deltas[ph].get(fam, [])
            if vals:
                line += f"  {np.mean(vals):+10.4f}"
            else:
                line += f"  {'—':>10}"
        print(line)

    # Interpretation
    print("\n=== Interpretation ===")
    print("  Prediction under training-artifact hypothesis:")
    print("    delta(Phase E) > delta(Phase C) for projection families")
    print("    → late layers have more rotation-gauge-hidden structure")
    print("    → specialization is partly training noise, not structural")
    print("")
    print("  Prediction under real-specialization hypothesis:")
    print("    delta ~ 0 everywhere")
    print("    → layers genuinely point in different directions, gauge irrelevant")

# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()