forked from kent/consciousness
replace try_lock() with lock_blocking() across UI thread
Add lock_blocking() to TrackedMutex: blocks current thread using block_in_place + futures::executor::block_on, safe for sync contexts. Replace all try_lock() calls with lock_blocking() in slash commands, UI rendering, and status reads. Lock hold times are fast enough that blocking briefly is fine, and this eliminates the spurious 'lock unavailable' paths that were never actually hit. Kept rx_mutex.try_lock() in mod.rs (std::sync::Mutex for stderr rx).
This commit is contained in:
parent
5210f7dd66
commit
4225294d16
28 changed files with 4199 additions and 67 deletions
168
sa-schedule-analyze-grams.py
Normal file
168
sa-schedule-analyze-grams.py
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
"""Analyze operator-level inter-layer alignment from the grams + eigdirs files.
|
||||
|
||||
Input:
|
||||
qwen3-4b-grams.json (gram[L,L',h], fro_sq[L,h])
|
||||
qwen3-4b-grams-eigdirs.pt (eig_dirs[L,h,topk,hidden], sym_eigs[L,h,2*head_dim])
|
||||
|
||||
Questions:
|
||||
(a) Operator cos-sim between layers. cos(g_L^h, g_L'^h) = gram / √(fro_sq fro_sq').
|
||||
If ~1 → same operator up to scalar. If low → distinct operators.
|
||||
(b) Scalar-rescale residual using full operator (not spectrum):
|
||||
optimal T = gram / fro_sq', residual_frac = √(1 - cos²).
|
||||
(c) Curvature-sign alignment. For each (L, anchor) pair, what fraction of
|
||||
top-k signed eigenvalues share sign with the anchor's?
|
||||
(d) Top-k eigensubspace alignment. Principal angles between span{eig_dirs_L}
|
||||
and span{eig_dirs_anchor}.
|
||||
|
||||
Compare: operator cos-sim vs spectral cos-sim (from prior analysis). The
|
||||
sheaf-rs finding was that spectral shape converges across layers while
|
||||
eigenvectors don't. We want to confirm/refute that within QK in Qwen3-4B.
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
|
||||
def _safe_mean(values):
    """Return the mean of *values* as a float, or NaN when the array is empty."""
    return float(values.mean()) if values.size else float("nan")


def main():
    """Print the operator-level inter-layer alignment report.

    Command line:
        gram_json   path to the JSON file holding ``num_layers``,
                    ``num_heads``, ``head_dim`` and ``gram`` (indexed
                    ``[L, L', h]``); an optional ``eigdirs_path`` key
                    overrides where the eigen-direction tensor file is read.
        --anchor    reference layer index; any negative value selects the
                    last layer.

    The eigdirs file is loaded with ``torch.load(..., weights_only=True)``
    and must contain ``eig_dirs`` (L, H, topk, hidden) and ``sym_eigs``
    (L, H, n_eigs).

    Four sections are printed, followed by a verdict:
      (a) head-averaged operator cosine similarity between layers,
      (b) per-head scalar-rescale residual against the anchor,
      (c) sign agreement of the leading eigenvalues against the anchor,
      (d) principal-angle cosines between top-k eigensubspaces.

    Exits through ``argparse`` (status 2) when ``--anchor`` is not a valid
    layer index.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("gram_json")
    ap.add_argument("--anchor", type=int, default=-1,
                    help="anchor layer index; -1 = last")
    args = ap.parse_args()

    with open(args.gram_json, encoding="utf-8") as f:
        d = json.load(f)
    num_layers = d["num_layers"]
    num_heads = d["num_heads"]
    head_dim = d["head_dim"]
    gram = np.array(d["gram"], dtype=np.float64)  # (L, L, H)

    anchor = args.anchor if args.anchor >= 0 else num_layers - 1
    if anchor >= num_layers:
        # Fail up front with a usage error instead of an IndexError mid-report.
        ap.error(f"--anchor {args.anchor} is out of range for {num_layers} layers")

    # NOTE: fro_sq from the json is ||W_K W_Q^T||_F^2 (the measure.py
    # shortcut), NOT ||g||_F^2 = ||W_K^T W_Q||_F^2 which is what the gram
    # diagonal gives. Different objects. Use gram diagonal for normalization.
    layer_idx = np.arange(num_layers)
    diag_sq = gram[layer_idx, layer_idx, :num_heads]  # (L, H)
    diag = np.sqrt(np.maximum(diag_sq, 1e-20))  # ||g_L^h||_F

    # Default eigdirs file sits next to the JSON: swap only a trailing
    # ".json" (str.replace would also rewrite ".json" inside directory names
    # and would leave a name without the suffix unchanged).
    stem = args.gram_json
    if stem.endswith(".json"):
        stem = stem[:-len(".json")]
    pt = torch.load(d.get("eigdirs_path", stem + "-eigdirs.pt"),
                    weights_only=True)
    eig_dirs = pt["eig_dirs"].double().numpy()  # (L, H, topk, hidden)
    sym_eigs = pt["sym_eigs"].double().numpy()  # (L, H, 2*head_dim)
    topk = eig_dirs.shape[2]

    # ==========================================================
    # (a) Operator cos-sim matrix, averaged over heads
    # ==========================================================
    # cos(g_L^h, g_L'^h) = gram[L, L', h] / (||g_L^h|| ||g_L'^h||)
    norm_prod = diag[:, None, :] * diag[None, :, :]  # (L, L, H)
    cos_per_head = gram[:num_layers, :num_layers] / np.maximum(norm_prod, 1e-20)
    cos_mat = cos_per_head.mean(axis=-1)  # (L, L)

    print(f"=== (a) Operator cos-sim between layers, averaged over {num_heads} heads ===")
    print(f" diagonal (should be 1.0): mean {np.diag(cos_mat).mean():.4f}")
    # Adjacent-layer cos-sim: the first super-diagonal of cos_mat.
    adj = np.diagonal(cos_mat, offset=1)
    if adj.size:
        print(f" adjacent layers cos-sim: mean {adj.mean():.4f} min {adj.min():.4f} max {adj.max():.4f}")
    else:
        # A single layer has no neighbours; min()/max() would raise.
        print(" adjacent layers cos-sim: n/a (single layer)")
    # Layer-to-anchor cos-sim
    to_anchor = cos_mat[:, anchor]
    # Least-squares scalar T per head, then averaged over heads.
    t_opt = (gram[:num_layers, anchor]
             / np.maximum(diag_sq[anchor], 1e-20)).mean(axis=-1)
    print(f" layer -> anchor L={anchor} cos-sim:")
    print(f" {'L':>3} {'cos':>7} {'T_opt':>7} {'resid_frac':>10}")
    for L in range(num_layers):
        c = to_anchor[L]
        T = float(t_opt[L])
        # Residual of the head-averaged cosine; section (b) averages the
        # per-head residuals instead.
        r = float(np.sqrt(max(1.0 - c**2, 0.0)))
        print(f" {L:>3} {c:+.4f} {T:+7.3f} {r:>10.4f}")

    # Long-range cos-sim: first and middle layer against the last layer.
    print(f"\n long-range: cos(L=0, last) = {cos_mat[0, -1]:+.3f} "
          f"cos(L=midish, last) = {cos_mat[num_layers//2, -1]:+.3f}")

    # ==========================================================
    # (b) Full scalar-rescale residual using the gram
    # ==========================================================
    print(f"\n=== (b) Operator-level scalar rescale to anchor L={anchor} ===")
    # residual_frac^2 = 1 - cos^2(g_L, g_anchor) (per head)
    print(f" {'L':>3} {'mean_cos':>9} {'mean_resid':>10}")
    for L in range(num_layers):
        per_h_cos = cos_per_head[L, anchor]
        per_h_resid = np.sqrt(np.clip(1.0 - per_h_cos**2, 0.0, 1.0))
        print(f" {L:>3} {per_h_cos.mean():>+9.4f} {per_h_resid.mean():>10.4f}")

    # ==========================================================
    # (c) Curvature-sign alignment
    # ==========================================================
    print(f"\n=== (c) Curvature-sign alignment vs anchor L={anchor} ===")
    # Early/mid/late bands are thirds of the layer stack. For 36 layers this
    # is the 12/24 split; shallower models no longer get empty bands.
    band_lo = num_layers // 3
    band_hi = (2 * num_layers) // 3
    # The code compares the first k_use entries of sym_eigs position by
    # position; the source comment says they are already sorted by magnitude
    # in measure (not verifiable from this file).
    for k_use in [2, 4, 8, 16, 32, 64, 128, 256]:
        if k_use > sym_eigs.shape[-1]:
            continue
        sign_L = np.sign(sym_eigs[:, :, :k_use])  # (L, H, k_use)
        sign_a = np.sign(sym_eigs[anchor, :, :k_use])  # (H, k_use)
        agree = (sign_L == sign_a[None, :, :]).mean(axis=-1)  # (L, H)
        print(f" top-{k_use:>3} signs: mean agree = {agree.mean():.3f} "
              f"by layer range: early {_safe_mean(agree[:band_lo]):.3f} "
              f"mid {_safe_mean(agree[band_lo:band_hi]):.3f} "
              f"late {_safe_mean(agree[band_hi:]):.3f}")

    # Also: distribution of sign-balance per layer (fraction positive eigenvalues)
    frac_pos = (sym_eigs[:, :, :2 * head_dim] > 0).mean(axis=(1, 2))
    print("\n fraction positive eigenvalues per layer:")
    for L in range(num_layers):
        print(f" L={L:2} frac+ = {frac_pos[L]:.3f}")

    # ==========================================================
    # (d) Eigenspace principal angles
    # ==========================================================
    print(f"\n=== (d) Top-{topk} eigensubspace principal angles vs anchor L={anchor} ===")
    # Per-head: cos of principal angles between row-spans of eig_dirs[L, h]
    # and eig_dirs[anchor, h]. Report mean cos angle per layer.
    # NOTE(review): the last column is headed 'max_top1' but holds the
    # head-averaged squared top-1 overlap; header left unchanged.
    print(f" {'L':>3} {'meanCosPA':>10} {'minCosPA':>10} {'max_top1':>10}")
    # The anchor bases do not depend on L: orthonormalize them once per head.
    anchor_bases = [np.linalg.qr(eig_dirs[anchor, h].T)[0]
                    for h in range(num_heads)]  # each hidden x topk
    for L in range(num_layers):
        mean_cos_pa_per_h = []
        min_cos_pa_per_h = []
        top1_overlap = []
        for h in range(num_heads):
            A = eig_dirs[L, h]  # (topk, hidden)
            B = eig_dirs[anchor, h]  # (topk, hidden)
            # Orthonormalize rows (they're close-to-orthonormal but not exactly)
            Qa, _ = np.linalg.qr(A.T)  # hidden x topk
            # Singular values of Qa^T Qb are the cosines of the principal angles.
            s = np.linalg.svd(Qa.T @ anchor_bases[h], compute_uv=False)
            mean_cos_pa_per_h.append(s.mean())
            min_cos_pa_per_h.append(s.min())
            # |<a_0, b_0>|^2: top-1 eigenvector overlap
            top1_overlap.append(float((A[0] @ B[0]) ** 2))
        print(f" {L:>3} {np.mean(mean_cos_pa_per_h):>10.4f} "
              f"{np.mean(min_cos_pa_per_h):>10.4f} "
              f"{np.mean(top1_overlap):>10.4f}")

    # ==========================================================
    # Verdict
    # ==========================================================
    # The head-averaged cosine to the anchor is the anchor column of cos_mat.
    mean_cos_to_anchor = to_anchor.mean()
    print("\n=== Verdict ===")
    print(f" mean operator cos-sim to anchor: {mean_cos_to_anchor:+.4f}")
    adj_mean = _safe_mean(adj)
    print(f" mean operator cos-sim adjacent layers: {adj_mean:+.4f}")
    if mean_cos_to_anchor > 0.9:
        print(" STRONG: same operator up to scalar across all layers.")
    elif mean_cos_to_anchor > 0.5:
        print(" MEDIUM: substantial shared operator, but layer-specific drift.")
    elif mean_cos_to_anchor > 0.1:
        print(" WEAK: some alignment; far from single-operator interpretation.")
    else:
        print(" REJECTED: operators are effectively orthogonal across layers.")
|
||||
|
||||
|
||||
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue