consciousness/sa-schedule-analyze-grams.py

168 lines
8.1 KiB
Python
Raw Normal View History

"""Analyze operator-level inter-layer alignment from the grams + eigdirs files.
Input:
qwen3-4b-grams.json (gram[L,L',h], fro_sq[L,h])
qwen3-4b-grams-eigdirs.pt (eig_dirs[L,h,topk,hidden], sym_eigs[L,h,2*head_dim])
Questions:
(a) Operator cos-sim between layers:
cos(g_L^h, g_L'^h) = gram[L,L',h] / √(gram[L,L,h] · gram[L',L',h]).
(The gram diagonal, not the json fro_sq, is used for normalization.)
If ~1: same operator up to scalar. If low: distinct operators.
(b) Scalar-rescale residual using full operator (not spectrum):
optimal T = gram[L,L',h] / gram[L',L',h], residual_frac = √(1 - cos²).
(c) Curvature-sign alignment. For each (L, anchor) pair, what fraction of
top-k signed eigenvalues share sign with the anchor's?
(d) Top-k eigensubspace alignment. Principal angles between span{eig_dirs_L}
and span{eig_dirs_anchor}.
Compare: operator cos-sim vs spectral cos-sim (from prior analysis). The
sheaf-rs finding was that spectral shape converges across layers while
eigenvectors don't. We want to confirm/refute that within QK in Qwen3-4B.
"""
import argparse
import json
import numpy as np
import torch
def _slice_mean(values):
    """Return the mean of ``values`` as a float, or NaN when it is empty.

    Avoids NumPy's "mean of empty slice" warning when a model has too few
    layers to populate every early/mid/late bucket.
    """
    return float(values.mean()) if values.size else float("nan")


def main():
    """Print operator-level inter-layer alignment statistics to stdout.

    Command line:
        gram_json   Path to the grams JSON. Keys read: ``num_layers``,
                    ``num_heads``, ``head_dim``, ``gram`` and, optionally,
                    ``eigdirs_path``.
        --anchor    Layer index every other layer is compared against; any
                    negative value selects the last layer.

    The eigdirs tensor file (keys ``eig_dirs`` and ``sym_eigs``) is loaded
    from ``eigdirs_path`` when the JSON provides it, otherwise from the JSON
    path with its trailing ``.json`` replaced by ``-eigdirs.pt``.

    Reports, in order: (a) head-averaged operator cos-sim between layers,
    (b) per-head scalar-rescale residual to the anchor, (c) sign agreement
    of the leading eigenvalues with the anchor, (d) principal angles between
    the top-k eigensubspaces, followed by a verdict line.

    Exits through ``argparse`` (status 2) when ``--anchor`` is not a valid
    layer index.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("gram_json")
    ap.add_argument("--anchor", type=int, default=-1,
                    help="anchor layer index; -1 = last")
    args = ap.parse_args()

    with open(args.gram_json, encoding="utf-8") as f:
        d = json.load(f)

    num_layers = d["num_layers"]
    num_heads = d["num_heads"]
    head_dim = d["head_dim"]
    gram = np.array(d["gram"])  # (L, L, H)

    anchor = args.anchor if args.anchor >= 0 else num_layers - 1
    if anchor >= num_layers:
        # Fail early with a readable message instead of an IndexError later.
        ap.error(f"--anchor {args.anchor} is out of range for {num_layers} layers")

    # NOTE: fro_sq from the json is ||W_K W_Q^T||_F^2 (the measure.py
    # shortcut), NOT ||g||_F^2 = ||W_K^T W_Q||_F^2 which is what the gram
    # diagonal gives. Different objects. Use gram diagonal for normalization.
    diag_sq = np.einsum("llh->lh", gram)  # (L, H): gram[L, L, h]
    diag = np.sqrt(np.maximum(diag_sq, 1e-20))  # ||g_L^h||_F

    # Replace only the trailing ".json"; str.replace would also rewrite any
    # earlier ".json" occurring inside a directory name.
    json_suffix = ".json"
    stem = args.gram_json
    if stem.endswith(json_suffix):
        stem = stem[:-len(json_suffix)]
    pt = torch.load(d.get("eigdirs_path", stem + "-eigdirs.pt"),
                    weights_only=True)
    eig_dirs = pt["eig_dirs"].double().numpy()  # (L, H, topk, hidden)
    sym_eigs = pt["sym_eigs"].double().numpy()  # (L, H, 2*head_dim)
    topk = eig_dirs.shape[2]

    # ==========================================================
    # (a) Operator cos-sim matrix, averaged over heads
    # ==========================================================
    norm_prod = diag[:, None, :] * diag[None, :, :]  # (L, L, H)
    cos_per_head = gram / np.maximum(norm_prod, 1e-20)  # (L, L, H)
    cos_mat = cos_per_head.mean(axis=-1)  # (L, L)

    print(f"=== (a) Operator cos-sim between layers, averaged over {num_heads} heads ===")
    print(f" diagonal (should be 1.0): mean {np.diag(cos_mat).mean():.4f}")

    # Adjacent-layer cos-sim: the first super-diagonal, cos_mat[L, L+1].
    adj = np.diagonal(cos_mat, offset=1)
    print(f" adjacent layers cos-sim: mean {adj.mean():.4f} min {adj.min():.4f} max {adj.max():.4f}")

    # Layer-to-anchor cos-sim
    to_anchor = cos_mat[:, anchor]
    # Head-averaged optimal scalar T = <g_L, g_anchor> / ||g_anchor||^2.
    t_opt = (gram[:, anchor, :] / np.maximum(diag_sq[anchor], 1e-20)).mean(axis=-1)
    print(f" layer -> anchor L={anchor} cos-sim:")
    print(f" {'L':>3} {'cos':>7} {'T_opt':>7} {'resid_frac':>10}")
    for L in range(num_layers):
        c = to_anchor[L]
        T = float(t_opt[L])
        # Residual of the head-averaged cosine; section (b) reports the
        # per-head residual averaged over heads instead.
        r = float(np.sqrt(max(1.0 - c**2, 0.0)))
        print(f" {L:>3} {c:+.4f} {T:+7.3f} {r:>10.4f}")

    # Long-range cos-sim: first and middle layer against the last layer.
    print(f"\n long-range: cos(L=0, last) = {cos_mat[0, -1]:+.3f} "
          f"cos(L=midish, last) = {cos_mat[num_layers//2, -1]:+.3f}")

    # ==========================================================
    # (b) Full scalar-rescale residual using the gram
    # ==========================================================
    print(f"\n=== (b) Operator-level scalar rescale to anchor L={anchor} ===")
    # residual_frac² = 1 - cos²(g_L, g_anchor) (per head)
    print(f" {'L':>3} {'mean_cos':>9} {'mean_resid':>10}")
    for L in range(num_layers):
        per_h_cos = cos_per_head[L, anchor]
        per_h_resid = np.sqrt(np.clip(1.0 - per_h_cos**2, 0.0, 1.0))
        print(f" {L:>3} {per_h_cos.mean():>+9.4f} {per_h_resid.mean():>10.4f}")

    # ==========================================================
    # (c) Curvature-sign alignment
    # ==========================================================
    print(f"\n=== (c) Curvature-sign alignment vs anchor L={anchor} ===")
    # Look at top-k eigenvalues by magnitude (already sorted that way in measure).
    # Fraction of top-k (L, h) whose sign matches the anchor's i-th eigenvalue.
    # Early/mid/late buckets are thirds of the layer stack (12/24 at 36 layers)
    # rather than fixed indices, so shallower models do not hit empty slices.
    early_end = num_layers // 3
    mid_end = (2 * num_layers) // 3
    for k_use in [2, 4, 8, 16, 32, 64, 128, 256]:
        if k_use > sym_eigs.shape[-1]:
            continue
        # sign of top-k_use eigenvalues at layer L vs at anchor, per (L, h)
        sign_L = np.sign(sym_eigs[:, :, :k_use])  # (L, H, k_use)
        sign_a = sign_L[anchor]  # (H, k_use)
        agree = (sign_L == sign_a[None, :, :]).mean(axis=-1)  # (L, H)
        print(f" top-{k_use:>3} signs: mean agree = {agree.mean():.3f} "
              f"by layer range: early {_slice_mean(agree[:early_end]):.3f} "
              f"mid {_slice_mean(agree[early_end:mid_end]):.3f} "
              f"late {_slice_mean(agree[mid_end:]):.3f}")

    # Also: distribution of sign-balance per layer (fraction positive eigenvalues)
    frac_pos = (sym_eigs[:, :, :2 * head_dim] > 0).mean(axis=(1, 2))
    print(f"\n fraction positive eigenvalues per layer:")
    for L in range(num_layers):
        print(f" L={L:2} frac+ = {frac_pos[L]:.3f}")

    # ==========================================================
    # (d) Eigenspace principal angles
    # ==========================================================
    print(f"\n=== (d) Top-{topk} eigensubspace principal angles vs anchor L={anchor} ===")
    # Per-head: cos of principal angles between row-spans of eig_dirs[L, h]
    # and eig_dirs[anchor, h]. Report mean cos angle per layer.
    print(f" {'L':>3} {'meanCosPA':>10} {'minCosPA':>10} {'max_top1':>10}")
    # The anchor bases do not depend on L, so orthonormalize them once.
    anchor_bases = [np.linalg.qr(eig_dirs[anchor, h].T)[0]
                    for h in range(num_heads)]  # each hidden × topk
    for L in range(num_layers):
        mean_cos_pa_per_h = []
        min_cos_pa_per_h = []
        top1_overlap = []
        for h in range(num_heads):
            A = eig_dirs[L, h]  # (topk, hidden) rows are unit vectors
            B = eig_dirs[anchor, h]  # (topk, hidden)
            # Orthonormalize rows (they're close-to-orthonormal but not exactly)
            Qa, _ = np.linalg.qr(A.T)  # hidden × topk
            # Singular values of Qa^T Qb are the cosines of the principal angles.
            s = np.linalg.svd(Qa.T @ anchor_bases[h], compute_uv=False)
            mean_cos_pa_per_h.append(s.mean())
            min_cos_pa_per_h.append(s.min())
            # |<a_0, b_0>|² — top-1 eigenvector overlap
            top1_overlap.append(float((A[0] @ B[0]) ** 2))
        print(f" {L:>3} {np.mean(mean_cos_pa_per_h):>10.4f} "
              f"{np.mean(min_cos_pa_per_h):>10.4f} "
              f"{np.mean(top1_overlap):>10.4f}")

    # ==========================================================
    # Verdict
    # ==========================================================
    # Head-averaged cos-sim to the anchor, averaged over all layers
    # (the anchor's own entry is included).
    mean_cos_to_anchor = to_anchor.mean()
    print(f"\n=== Verdict ===")
    print(f" mean operator cos-sim to anchor: {mean_cos_to_anchor:+.4f}")
    adj_mean = adj.mean()
    print(f" mean operator cos-sim adjacent layers: {adj_mean:+.4f}")
    if mean_cos_to_anchor > 0.9:
        print(" STRONG: same operator up to scalar across all layers.")
    elif mean_cos_to_anchor > 0.5:
        print(" MEDIUM: substantial shared operator, but layer-specific drift.")
    elif mean_cos_to_anchor > 0.1:
        print(" WEAK: some alignment; far from single-operator interpretation.")
    else:
        print(" REJECTED: operators are effectively orthogonal across layers.")


if __name__ == "__main__":
    main()