# consciousness/sa-schedule-analyze-grams.py

"""Analyze operator-level inter-layer alignment from the grams + eigdirs files.

Input:
  qwen3-4b-grams.json       (gram[L,L',h], fro_sq[L,h])
  qwen3-4b-grams-eigdirs.pt (eig_dirs[L,h,topk,hidden], sym_eigs[L,h,2*head_dim])

Questions:
  (a) Operator cos-sim between layers.
      cos(g_L^h, g_L'^h) = gram[L,L',h] / √(gram[L,L,h] · gram[L',L',h]).
      The normalization uses the gram diagonal, not the json's fro_sq.
      If ~1 → same operator up to scalar. If low → distinct operators.
  (b) Scalar-rescale residual using full operator (not spectrum):
      optimal T = gram[L,L',h] / gram[L',L',h], residual_frac = √(1 - cos²).
  (c) Curvature-sign alignment.  For each (L, anchor) pair, what fraction of
      top-k signed eigenvalues share sign with the anchor's?
  (d) Top-k eigensubspace alignment.  Principal angles between span{eig_dirs_L}
      and span{eig_dirs_anchor}.

  Compare: operator cos-sim vs spectral cos-sim (from prior analysis). The
  sheaf-rs finding was that spectral shape converges across layers while
  eigenvectors don't. We want to confirm/refute that within QK in Qwen3-4B.
"""
import argparse
import json
import numpy as np
import torch


_EPS = 1e-20  # floor applied to norms/denominators to avoid division by zero


def _default_eigdirs_path(gram_json):
    """Return the default eigdirs path derived from the grams json path.

    Only a trailing ``.json`` is stripped, so a ``.json`` substring elsewhere
    in the path (e.g. inside a directory name) is left untouched.
    """
    suffix = ".json"
    stem = gram_json[:-len(suffix)] if gram_json.endswith(suffix) else gram_json
    return stem + "-eigdirs.pt"


def _per_head_cosine(gram, num_layers):
    """Return ``(cos, diag_sq, diag)`` computed from ``gram[L, L', h]``.

    ``diag_sq[L, h] = gram[L, L, h]``, ``diag`` is its floored square root,
    and ``cos[L, L', h] = gram[L, L', h] / (diag[L, h] * diag[L', h])``.
    """
    idx = np.arange(num_layers)
    diag_sq = gram[idx, idx]                             # (L, H)
    diag = np.sqrt(np.maximum(diag_sq, _EPS))            # ||g_L^h||_F
    denom = diag[:, None, :] * diag[None, :, :]          # (L, L, H)
    cos = gram[:num_layers, :num_layers] / np.maximum(denom, _EPS)
    return cos, diag_sq, diag


def _fmt_mean(values):
    """Format the mean of ``values`` to 3 decimals, or ``n/a`` when empty."""
    return f"{values.mean():.3f}" if values.size else "n/a"


def main():
    """Print the inter-layer operator alignment report for one grams file.

    Command line: ``gram_json [--anchor N]``. Any negative ``--anchor``
    selects the last layer; an index ``>= num_layers`` is rejected with an
    argparse error (exit status 2).

    Reads ``num_layers``, ``num_heads``, ``head_dim`` and ``gram`` from the
    json, and ``eig_dirs`` / ``sym_eigs`` from the torch file named by the
    json's optional ``eigdirs_path`` key (default: the json path with its
    trailing ``.json`` replaced by ``-eigdirs.pt``).

    Sections printed to stdout:
      (a) head-averaged operator cos-sim (diagonal, adjacent, to-anchor),
      (b) per-head scalar-rescale residual to the anchor,
      (c) sign agreement of the leading eigenvalues with the anchor,
      (d) principal-angle cosines of the top-k eigen-subspaces vs the anchor,
      followed by a STRONG / MEDIUM / WEAK / REJECTED verdict based on the
      mean head-averaged cos-sim to the anchor.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("gram_json")
    ap.add_argument("--anchor", type=int, default=-1,
                    help="anchor layer index; -1 = last")
    args = ap.parse_args()

    with open(args.gram_json) as f:
        d = json.load(f)
    num_layers = d["num_layers"]
    num_heads = d["num_heads"]
    head_dim = d["head_dim"]
    gram = np.array(d["gram"])                           # (L, L, H)

    # Fail fast with a usage error instead of an IndexError mid-report.
    if args.anchor >= num_layers:
        ap.error(f"--anchor {args.anchor} out of range for {num_layers} layers")
    anchor = args.anchor if args.anchor >= 0 else num_layers - 1

    # NOTE: fro_sq from the json is ||W_K W_Q^T||_F^2 (the measure.py
    # shortcut), NOT ||g||_F^2 = ||W_K^T W_Q||_F^2 which is what the gram
    # diagonal gives. Different objects. Use gram diagonal for normalization.
    cos_per_head, diag_sq, _ = _per_head_cosine(gram, num_layers)

    pt = torch.load(d.get("eigdirs_path", _default_eigdirs_path(args.gram_json)),
                    weights_only=True)
    eig_dirs = pt["eig_dirs"].double().numpy()           # (L, H, topk, hidden)
    sym_eigs = pt["sym_eigs"].double().numpy()           # (L, H, 2*head_dim)
    topk = eig_dirs.shape[2]

    # ==========================================================
    # (a) Operator cos-sim matrix, averaged over heads
    # ==========================================================
    cos_mat = cos_per_head.mean(axis=-1)                 # (L, L)

    print(f"=== (a) Operator cos-sim between layers, averaged over {num_heads} heads ===")
    print(f"  diagonal (should be 1.0): mean {np.diag(cos_mat).mean():.4f}")
    # Adjacent-layer cos-sim: the first super-diagonal, empty for one layer.
    adj = np.diagonal(cos_mat, offset=1)
    if adj.size:
        print(f"  adjacent layers cos-sim: mean {adj.mean():.4f}  min {adj.min():.4f}  max {adj.max():.4f}")
    else:
        print("  adjacent layers cos-sim: n/a (single layer)")
    # Layer-to-anchor cos-sim
    to_anchor = cos_mat[:, anchor]
    print(f"  layer -> anchor L={anchor} cos-sim:")
    print(f"    {'L':>3}  {'cos':>7}  {'T_opt':>7}  {'resid_frac':>10}")
    anchor_norm_sq = np.maximum(diag_sq[anchor], _EPS)   # (H,)
    for L in range(num_layers):
        c = to_anchor[L]
        T = float(np.mean(gram[L, anchor] / anchor_norm_sq))
        # Residual of the head-averaged cosine; section (b) instead averages
        # the per-head residuals, so the two columns can differ.
        r = float(np.sqrt(max(1.0 - c**2, 0.0)))
        print(f"    {L:>3}  {c:+.4f}  {T:+7.3f}  {r:>10.4f}")

    # Long-range cos-sim: first and middle layer against the LAST layer
    # (always the last layer, independent of --anchor).
    print(f"\n  long-range: cos(L=0, last) = {cos_mat[0, -1]:+.3f}  "
          f"cos(L=midish, last) = {cos_mat[num_layers//2, -1]:+.3f}")

    # ==========================================================
    # (b) Full scalar-rescale residual using the gram
    # ==========================================================
    print(f"\n=== (b) Operator-level scalar rescale to anchor L={anchor} ===")
    # residual_frac² = 1 - cos²(g_L, g_anchor)   (per head)
    print(f"  {'L':>3}  {'mean_cos':>9}  {'mean_resid':>10}")
    for L in range(num_layers):
        per_h_cos = cos_per_head[L, anchor]
        per_h_resid = np.sqrt(np.clip(1.0 - per_h_cos**2, 0.0, 1.0))
        print(f"  {L:>3}  {per_h_cos.mean():>+9.4f}  {per_h_resid.mean():>10.4f}")

    # ==========================================================
    # (c) Curvature-sign alignment
    # ==========================================================
    print(f"\n=== (c) Curvature-sign alignment vs anchor L={anchor} ===")
    # Look at top-k eigenvalues by magnitude (already sorted that way in measure).
    # Fraction of top-k (L, h) whose sign matches the anchor's i-th eigenvalue.
    # Early/mid/late buckets are thirds of the layer stack (12/24 for 36 layers).
    lo, hi = num_layers // 3, 2 * num_layers // 3
    for k_use in [2, 4, 8, 16, 32, 64, 128, 256]:
        if k_use > sym_eigs.shape[-1]:
            continue
        # sign of top-k_use eigenvalues at layer L vs at anchor, per (L, h)
        sign_L = np.sign(sym_eigs[:, :, :k_use])          # (L, H, k_use)
        sign_a = np.sign(sym_eigs[anchor, :, :k_use])     # (H, k_use)
        agree = (sign_L == sign_a[None, :, :]).mean(axis=-1)   # (L, H)
        print(f"  top-{k_use:>3} signs: mean agree = {agree.mean():.3f}  "
              f"by layer range: early {_fmt_mean(agree[:lo])} "
              f"mid {_fmt_mean(agree[lo:hi])} late {_fmt_mean(agree[hi:])}")

    # Also: distribution of sign-balance per layer (fraction positive eigenvalues)
    frac_pos = (sym_eigs[:, :, :2 * head_dim] > 0).mean(axis=(1, 2))
    print(f"\n  fraction positive eigenvalues per layer:")
    for L in range(num_layers):
        print(f"    L={L:2}  frac+ = {frac_pos[L]:.3f}")

    # ==========================================================
    # (d) Eigenspace principal angles
    # ==========================================================
    print(f"\n=== (d) Top-{topk} eigensubspace principal angles vs anchor L={anchor} ===")
    # Per-head: cos of principal angles between row-spans of eig_dirs[L, h]
    # and eig_dirs[anchor, h]. Report mean cos angle per layer.
    # NOTE: the last column is the head-MEAN of the squared top-1 overlap,
    # despite the 'max_top1' label.
    print(f"  {'L':>3}  {'meanCosPA':>10}  {'minCosPA':>10}  {'max_top1':>10}")
    # The anchor's orthonormal bases do not depend on L: factor them once.
    anchor_bases = [np.linalg.qr(eig_dirs[anchor, h].T)[0]   # hidden × topk
                    for h in range(num_heads)]
    for L in range(num_layers):
        mean_cos_pa_per_h = []
        min_cos_pa_per_h = []
        top1_overlap = []
        for h in range(num_heads):
            A = eig_dirs[L, h]           # (topk, hidden)
            # QR re-orthonormalizes the rows; upstream presumably stores
            # near-orthonormal unit eigenvectors (not verifiable from here).
            Qa, _ = np.linalg.qr(A.T)    # hidden × topk
            # Singular values of Qa^T Qb are the principal-angle cosines.
            s = np.linalg.svd(Qa.T @ anchor_bases[h], compute_uv=False)
            mean_cos_pa_per_h.append(s.mean())
            min_cos_pa_per_h.append(s.min())
            # |<a_0, b_0>|² — top-1 eigenvector overlap
            top1_overlap.append(float((A[0] @ eig_dirs[anchor, h, 0]) ** 2))
        print(f"  {L:>3}  {np.mean(mean_cos_pa_per_h):>10.4f}  "
              f"{np.mean(min_cos_pa_per_h):>10.4f}  "
              f"{np.mean(top1_overlap):>10.4f}")

    # ==========================================================
    # Verdict
    # ==========================================================
    # Mean over layers of the head-averaged cos-sim to the anchor (this
    # includes the anchor's own cos-sim of ~1).
    mean_cos_to_anchor = to_anchor.mean()
    print(f"\n=== Verdict ===")
    print(f"  mean operator cos-sim to anchor: {mean_cos_to_anchor:+.4f}")
    if adj.size:
        print(f"  mean operator cos-sim adjacent layers: {adj.mean():+.4f}")
    else:
        print("  mean operator cos-sim adjacent layers: n/a (single layer)")
    if mean_cos_to_anchor > 0.9:
        print("  STRONG: same operator up to scalar across all layers.")
    elif mean_cos_to_anchor > 0.5:
        print("  MEDIUM: substantial shared operator, but layer-specific drift.")
    elif mean_cos_to_anchor > 0.1:
        print("  WEAK: some alignment; far from single-operator interpretation.")
    else:
        print("  REJECTED: operators are effectively orthogonal across layers.")


# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()