# consciousness/sa-schedule-gamma-directions.py

"""Pull input_layernorm.γ vectors from a model and analyze direction
structure across layers.

Question: is γ just scalar magnitude (isotropic SA) or does each layer
have a preferred direction (anisotropic SA / geometry-aware)?

Decomposition: γ_L = ||γ_L|| · γ_L̂
  - ||γ_L|| is what our scalar Kirkpatrick fit captured
  - γ_L̂ is unit direction — if layers share direction, γ is rank-1 +
    scaling (classical isotropic). If directions differ per layer, γ
    encodes per-layer preferred axis (anisotropic).

We also look at:
  - pairwise cos-sim between γ_L̂ across layers
  - principal components of [γ_L̂]_L (stacked matrix)
  - per-phase structure: is Phase E more anisotropic than Phase C?
"""
import argparse
import json

import numpy as np
import torch
from transformers import AutoModelForCausalLM


# Inclusive upper layer index of each phase. These boundaries were chosen for
# Qwen3-32B; they are applied unchanged to whatever --model is given, so the
# phase labels are only meaningful for that model.
_QWEN3_32B_PHASE_BOUNDS = (("A", 6), ("B", 9), ("C", 31), ("D", 46), ("E", 58))


def _phase(layer):
    """Return the phase label of a layer index ("tail" past the last bound)."""
    for label, last_layer in _QWEN3_32B_PHASE_BOUNDS:
        if layer <= last_layer:
            return label
    return "tail"


def _extract_gammas(model, num_layers):
    """Stack each decoder layer's input_layernorm weight into an (L, hidden) array."""
    return np.stack([
        model.model.layers[i].input_layernorm.weight.detach().float().cpu().numpy()
        for i in range(num_layers)
    ])


def _analyze_directions(gammas):
    """Split γ rows into magnitude and direction, then run PCA on the directions.

    Args:
        gammas: (L, hidden) array, one γ vector per layer.

    Returns:
        dict with keys
          norms      (L,)    ||γ_L||
          units      (L, h)  γ_L / ||γ_L||  (an all-zero γ stays all-zero)
          cos_mat    (L, L)  pairwise cosine similarity of the unit vectors
          singular   (k,)    singular values of the mean-centred unit matrix
          explained  (k,)    fraction of variance per principal component
          proj_top1  (L,)    projection of each unit vector on PC0
          resid_top1 (L,)    sqrt(1 - proj_top1**2), clipped at 0
    """
    norms = np.linalg.norm(gammas, axis=1)

    # Guard against an all-zero γ: dividing by its norm would give a NaN row
    # that then poisons the cos-sim matrix, the mean and the SVD.
    safe_norms = norms.copy()
    safe_norms[safe_norms == 0] = 1
    units = gammas / safe_norms[:, None]

    cos_mat = units @ units.T

    # PCA on the unit vectors. Because the data is mean-centred, PC0 is the
    # axis of largest variation *around* the mean direction, not the mean
    # direction itself.
    centered = units - units.mean(axis=0, keepdims=True)
    _, singular, vt = np.linalg.svd(centered, full_matrices=False)
    variance = singular ** 2
    total_variance = variance.sum()
    if total_variance > 0:
        explained = variance / total_variance
    else:
        # No spread at all (e.g. a single layer): report 0 instead of 0/0 = NaN.
        explained = np.zeros_like(variance)

    top1 = vt[0]
    proj_top1 = units @ top1
    # The sign of a singular vector is arbitrary and may differ between
    # LAPACK builds. Orient PC0 so the projections sum to >= 0, which makes
    # the signed values printed and saved below reproducible.
    if proj_top1.sum() < 0:
        proj_top1 = -proj_top1
    resid_top1 = np.sqrt(np.maximum(1 - proj_top1 ** 2, 0))

    return {
        "norms": norms,
        "units": units,
        "cos_mat": cos_mat,
        "singular": singular,
        "explained": explained,
        "proj_top1": proj_top1,
        "resid_top1": resid_top1,
    }


def _print_report(stats):
    """Print the per-layer, PCA, per-phase and adjacent-pair tables to stdout."""
    norms = stats["norms"]
    units = stats["units"]
    cos_mat = stats["cos_mat"]
    singular = stats["singular"]
    explained = stats["explained"]
    proj_top1 = stats["proj_top1"]
    resid_top1 = stats["resid_top1"]
    num_layers = len(norms)

    print("\n=== ||γ_L|| per layer (scalar magnitude) ===")
    for L in range(num_layers):
        print(f"  L={L:>2}  phase={_phase(L):>5}  ||γ||={norms[L]:>8.3f}  "
              f"proj_top1={proj_top1[L]:>+.4f}  resid={resid_top1[L]:>.4f}")

    print("\n=== PCA of unit γ vectors (direction structure) ===")
    print("  Explained variance, top 10 components:")
    for i in range(min(10, len(singular))):
        print(f"    PC{i}: {explained[i]:.4f}  (singular_val={singular[i]:.4f})")
    print(f"  Top-3 explain: {explained[:3].sum():.4f}")
    print(f"  Top-10 explain: {explained[:10].sum():.4f}")

    # Group layer indices by phase, keeping first-seen (ascending) order.
    phase_layers = {}
    for L in range(num_layers):
        phase_layers.setdefault(_phase(L), []).append(L)

    print("\n=== Per-phase direction statistics ===")
    print(f"  {'phase':>6}  {'N':>3}  {'||γ||_mean':>10}  {'||γ||_std':>9}  "
          f"{'intra_cos':>9}  {'vs_other_cos':>12}")
    for ph, Ls in phase_layers.items():
        in_phase = np.zeros(num_layers, dtype=bool)
        in_phase[Ls] = True
        u = units[in_phase]
        # Mean cos-sim over distinct layer pairs inside the phase; a phase
        # with a single layer has no pairs and is reported as 1.0.
        intra = (u @ u.T)[np.triu_indices(len(Ls), k=1)]
        intra_mean = intra.mean() if len(intra) > 0 else 1.0
        # Mean cos-sim between this phase's layers and all other layers.
        u_other = units[~in_phase]
        vs_mean = (u @ u_other.T).mean() if len(u_other) > 0 else 0.0
        print(f"  {ph:>6}  {len(Ls):>3}  {norms[Ls].mean():>10.3f}  "
              f"{norms[Ls].std():>9.3f}  {intra_mean:>+9.4f}  {vs_mean:>+12.4f}")

    print("\n=== Adjacent-pair unit-γ cos-sim ===")
    for L in range(num_layers - 1):
        print(f"  L={L:>2}→{L+1:>2}  phase={_phase(L):>5}  cos={cos_mat[L, L+1]:>+.4f}")


def _write_summary(path, model_name, stats):
    """Dump the scalar per-layer summaries to `path` as JSON."""
    num_layers = len(stats["norms"])
    with open(path, "w", encoding="utf-8") as f:
        json.dump({
            "model": model_name,
            "num_layers": num_layers,
            "norms": stats["norms"].tolist(),
            "proj_top1": stats["proj_top1"].tolist(),
            "explained_var": stats["explained"].tolist(),
            # First super-diagonal of the cos-sim matrix = cos(γ_L, γ_{L+1}).
            "cos_adjacent": np.diagonal(stats["cos_mat"], offset=1).tolist(),
        }, f, indent=2)


def main():
    """CLI entry point: load a model, analyse its input_layernorm γ directions.

    Flags:
        --model  HuggingFace model id or local path (default Qwen/Qwen3-32B).
        --out    Path of the JSON summary to write (default /tmp/gamma-dirs.json).

    Prints the analysis tables to stdout and writes the JSON summary to --out.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="Qwen/Qwen3-32B")
    ap.add_argument("--out", default="/tmp/gamma-dirs.json")
    args = ap.parse_args()

    # from_pretrained materialises the whole model in float32 on CPU; only the
    # γ vectors are kept afterwards, so the message says so rather than
    # claiming a partial load.
    print(f"Loading {args.model} on CPU (full model; only input_layernorm γ is kept)...",
          flush=True)
    # NOTE(review): trust_remote_code=True executes Python shipped with the
    # checkpoint. Only point --model at repositories you trust.
    m = AutoModelForCausalLM.from_pretrained(
        args.model, torch_dtype=torch.float32, device_map="cpu",
        trust_remote_code=True,
    )
    num_layers = m.config.num_hidden_layers
    hidden = m.config.hidden_size
    print(f"  L={num_layers}, hidden={hidden}", flush=True)

    gammas = _extract_gammas(m, num_layers)  # (L, hidden)
    del m  # drop the model before the analysis to release its memory

    stats = _analyze_directions(gammas)
    _print_report(stats)
    _write_summary(args.out, args.model, stats)
    print(f"\nSaved: {args.out}")


# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()