forked from kent/consciousness
168 lines
8.1 KiB
Python
168 lines
8.1 KiB
Python
|
|
"""Analyze operator-level inter-layer alignment from the grams + eigdirs files.
|
|||
|
|
|
|||
|
|
Input:
|
|||
|
|
qwen3-4b-grams.json (gram[L,L',h], fro_sq[L,h])
|
|||
|
|
qwen3-4b-grams-eigdirs.pt (eig_dirs[L,h,topk,hidden], sym_eigs[L,h,2*head_dim])
|
|||
|
|
|
|||
|
|
Questions:
|
|||
|
|
(a) Operator cos-sim between layers. cos(g_L^h, g_L'^h) = gram[L,L',h] / √(gram[L,L,h] · gram[L',L',h]).
|
|||
|
|
If ~1 → same operator up to scalar. If low → distinct operators.
|
|||
|
|
(b) Scalar-rescale residual using full operator (not spectrum):
|
|||
|
|
optimal T = gram[L,L',h] / gram[L',L',h], residual_frac = √(1 - cos²).
|
|||
|
|
(c) Curvature-sign alignment. For each (L, anchor) pair, what fraction of
|
|||
|
|
top-k signed eigenvalues share sign with the anchor's?
|
|||
|
|
(d) Top-k eigensubspace alignment. Principal angles between span{eig_dirs_L}
|
|||
|
|
and span{eig_dirs_anchor}.
|
|||
|
|
|
|||
|
|
Compare: operator cos-sim vs spectral cos-sim (from prior analysis). The
|
|||
|
|
sheaf-rs finding was that spectral shape converges across layers while
|
|||
|
|
eigenvectors don't. We want to confirm/refute that within QK in Qwen3-4B.
|
|||
|
|
"""
|
|||
|
|
import argparse
|
|||
|
|
import json
|
|||
|
|
import numpy as np
|
|||
|
|
import torch
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Report operator-level inter-layer alignment of the per-head grams.

    Reads the gram JSON named on the command line and its companion
    eigen-direction tensor file, then prints to stdout:

    (a) the head-averaged operator cos-sim between layers, plus the scalar
        rescale and residual fraction of every layer against the anchor;
    (b) the per-head cos-sim and residual to the anchor, averaged over heads;
    (c) how often the leading eigenvalues share their sign with the anchor's;
    (d) principal-angle cosines between each layer's top-k eigensubspace and
        the anchor's;

    followed by a verdict derived from the mean cos-sim to the anchor.

    Command line:
        gram_json   JSON file providing "num_layers", "num_heads",
                    "head_dim" and "gram"; "eigdirs_path" is optional and
                    defaults to the JSON path with a trailing ".json"
                    swapped for "-eigdirs.pt".
        --anchor    anchor layer index; any negative value selects the last
                    layer.

    Raises:
        SystemExit: on invalid arguments, including an anchor index that is
            not smaller than the number of layers.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("gram_json")
    ap.add_argument("--anchor", type=int, default=-1,
                    help="anchor layer index; -1 = last")
    args = ap.parse_args()

    with open(args.gram_json, encoding="utf-8") as f:
        d = json.load(f)
    num_layers = d["num_layers"]
    num_heads = d["num_heads"]
    head_dim = d["head_dim"]
    gram = np.array(d["gram"])  # (L, L, H)

    anchor = args.anchor if args.anchor >= 0 else num_layers - 1
    if anchor >= num_layers:
        # Fail before any heavy loading instead of with a bare IndexError.
        ap.error(f"--anchor {args.anchor} is out of range for {num_layers} layers")

    # NOTE: fro_sq from the json is ||W_K W_Q^T||_F^2 (the measure.py
    # shortcut), NOT ||g||_F^2 = ||W_K^T W_Q||_F^2 which is what the gram
    # diagonal gives. Different objects. Use gram diagonal for normalization.
    layer_idx = np.arange(num_layers)
    diag_sq = gram[layer_idx, layer_idx]            # (L, H)
    diag = np.sqrt(np.maximum(diag_sq, 1e-20))      # ||g_L^h||_F

    pt = torch.load(d.get("eigdirs_path", _default_eigdirs_path(args.gram_json)),
                    weights_only=True)
    eig_dirs = pt["eig_dirs"].double().numpy()      # (L, H, topk, hidden)
    sym_eigs = pt["sym_eigs"].double().numpy()      # (L, H, 2*head_dim)
    topk = eig_dirs.shape[2]

    # ==========================================================
    # (a) Operator cos-sim matrix, averaged over heads
    # ==========================================================
    # Per-head cosine for every layer pair at once; the floor on the
    # denominator guards against an all-zero operator.
    cos_per_head = gram / np.maximum(diag[:, None, :] * diag[None, :, :], 1e-20)
    cos_mat = cos_per_head.mean(axis=-1)            # (L, L)

    print(f"=== (a) Operator cos-sim between layers, averaged over {num_heads} heads ===")
    print(f" diagonal (should be 1.0): mean {np.diag(cos_mat).mean():.4f}")
    # Adjacent-layer cos-sim is the first super-diagonal; it is empty when
    # there is only one layer, in which case min/max are undefined.
    adj = np.diagonal(cos_mat, offset=1)
    if adj.size:
        print(f" adjacent layers cos-sim: mean {adj.mean():.4f} min {adj.min():.4f} max {adj.max():.4f}")
    else:
        print(" adjacent layers cos-sim: n/a (single layer)")
    # Layer-to-anchor cos-sim
    to_anchor = cos_mat[:, anchor]
    # Least-squares scalar T with g_L ~ T * g_anchor, averaged over heads.
    t_opt = (gram[:, anchor] / np.maximum(diag_sq[anchor], 1e-20)).mean(axis=-1)
    print(f" layer -> anchor L={anchor} cos-sim:")
    print(f" {'L':>3} {'cos':>7} {'T_opt':>7} {'resid_frac':>10}")
    for layer in range(num_layers):
        c = to_anchor[layer]
        # Residual here is taken from the head-averaged cosine; section (b)
        # averages the per-head residuals instead.
        r = float(np.sqrt(max(1.0 - c**2, 0.0)))
        print(f" {layer:>3} {c:+.4f} {t_opt[layer]:+7.3f} {r:>10.4f}")

    # Long-range cos-sim: first and middle layer against the last layer.
    print(f"\n long-range: cos(L=0, last) = {cos_mat[0, -1]:+.3f} "
          f"cos(L=midish, last) = {cos_mat[num_layers//2, -1]:+.3f}")

    # ==========================================================
    # (b) Full scalar-rescale residual using the gram
    # ==========================================================
    print(f"\n=== (b) Operator-level scalar rescale to anchor L={anchor} ===")
    # residual_frac² = 1 - cos²(g_L, g_anchor) (per head)
    anchor_cos = cos_per_head[:, anchor]            # (L, H)
    anchor_resid = np.sqrt(np.clip(1.0 - anchor_cos**2, 0.0, 1.0))
    print(f" {'L':>3} {'mean_cos':>9} {'mean_resid':>10}")
    for layer in range(num_layers):
        print(f" {layer:>3} {anchor_cos[layer].mean():>+9.4f} {anchor_resid[layer].mean():>10.4f}")

    # ==========================================================
    # (c) Curvature-sign alignment
    # ==========================================================
    print(f"\n=== (c) Curvature-sign alignment vs anchor L={anchor} ===")
    # Compares the leading k_use entries of sym_eigs (per the original note,
    # these are expected to be sorted by magnitude upstream - not checked
    # here) position by position against the anchor layer.
    signs = np.sign(sym_eigs)                       # (L, H, 2*head_dim)
    # Early / mid / late are thirds of the layer stack; for 36 layers this
    # is the 12 / 24 split.
    early_end = num_layers // 3
    mid_end = (2 * num_layers) // 3
    for k_use in [2, 4, 8, 16, 32, 64, 128, 256]:
        if k_use > sym_eigs.shape[-1]:
            continue
        agree = (signs[:, :, :k_use] == signs[anchor, :, :k_use][None, :, :]).mean(axis=-1)  # (L, H)
        print(f" top-{k_use:>3} signs: mean agree = {agree.mean():.3f} "
              f"by layer range: early {_mean_or_nan(agree[:early_end]):.3f} "
              f"mid {_mean_or_nan(agree[early_end:mid_end]):.3f} "
              f"late {_mean_or_nan(agree[mid_end:]):.3f}")

    # Also: distribution of sign-balance per layer (fraction positive eigenvalues)
    frac_pos = (sym_eigs[:, :, :2 * head_dim] > 0).mean(axis=(1, 2))
    print(f"\n fraction positive eigenvalues per layer:")
    for layer in range(num_layers):
        print(f" L={layer:2} frac+ = {frac_pos[layer]:.3f}")

    # ==========================================================
    # (d) Eigenspace principal angles
    # ==========================================================
    print(f"\n=== (d) Top-{topk} eigensubspace principal angles vs anchor L={anchor} ===")
    # Per-head: cos of principal angles between row-spans of eig_dirs[L, h]
    # and eig_dirs[anchor, h]. Report mean cos angle per layer.
    # NOTE(review): the last column is labelled 'max_top1' but holds the
    # head-mean of the top-1 overlap; label kept so the output is unchanged.
    print(f" {'L':>3} {'meanCosPA':>10} {'minCosPA':>10} {'max_top1':>10}")
    # The anchor bases do not depend on the layer being compared, so they
    # are factorised once up front.
    anchor_bases = [_orthonormal_basis(eig_dirs[anchor, h]) for h in range(num_heads)]
    for layer in range(num_layers):
        mean_cos_pa_per_h = []
        min_cos_pa_per_h = []
        top1_overlap = []
        for h in range(num_heads):
            rows_l = eig_dirs[layer, h]             # (topk, hidden)
            rows_a = eig_dirs[anchor, h]            # (topk, hidden)
            # Singular values of Qa^T Qb are the cosines of the principal angles.
            s = np.linalg.svd(_orthonormal_basis(rows_l).T @ anchor_bases[h],
                              compute_uv=False)
            mean_cos_pa_per_h.append(s.mean())
            min_cos_pa_per_h.append(s.min())
            # <a_0, b_0>² - overlap of the first stored direction of each.
            top1_overlap.append(float((rows_l[0] @ rows_a[0]) ** 2))
        print(f" {layer:>3} {np.mean(mean_cos_pa_per_h):>10.4f} "
              f"{np.mean(min_cos_pa_per_h):>10.4f} "
              f"{np.mean(top1_overlap):>10.4f}")

    # ==========================================================
    # Verdict
    # ==========================================================
    mean_cos_to_anchor = to_anchor.mean()
    print(f"\n=== Verdict ===")
    print(f" mean operator cos-sim to anchor: {mean_cos_to_anchor:+.4f}")
    adj_mean = _mean_or_nan(adj)
    print(f" mean operator cos-sim adjacent layers: {adj_mean:+.4f}")
    if mean_cos_to_anchor > 0.9:
        print(" STRONG: same operator up to scalar across all layers.")
    elif mean_cos_to_anchor > 0.5:
        print(" MEDIUM: substantial shared operator, but layer-specific drift.")
    elif mean_cos_to_anchor > 0.1:
        print(" WEAK: some alignment; far from single-operator interpretation.")
    else:
        print(" REJECTED: operators are effectively orthogonal across layers.")


def _default_eigdirs_path(gram_json):
    """Return the companion tensor path: a trailing ".json" becomes "-eigdirs.pt"."""
    suffix = ".json"
    stem = gram_json[:-len(suffix)] if gram_json.endswith(suffix) else gram_json
    return stem + "-eigdirs.pt"


def _mean_or_nan(values):
    """Return the mean of *values* as a float, or NaN (without a warning) when empty."""
    return float(values.mean()) if values.size else float("nan")


def _orthonormal_basis(rows):
    """Return a (hidden, topk) matrix whose columns orthonormally span the rows of *rows*."""
    q, _ = np.linalg.qr(rows.T)
    return q
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|