forked from kent/consciousness
Add lock_blocking() to TrackedMutex: blocks current thread using block_in_place + futures::executor::block_on, safe for sync contexts. Replace all try_lock() calls with lock_blocking() in slash commands, UI rendering, and status reads. Lock hold times are fast enough that blocking briefly is fine, and this eliminates the spurious 'lock unavailable' paths that were never actually hit. Kept rx_mutex.try_lock() in mod.rs (std::sync::Mutex for stderr rx).
237 lines
9 KiB
Python
237 lines
9 KiB
Python
"""Null test: before any fitting, how similar are adjacent layers in the
|
|
raw weight-matrix sense?
|
|
|
|
For each adjacent layer pair (L, L+1) and each parameter family:
|
|
1. Normalize each matrix by its Frobenius norm (unit sphere).
|
|
2. Compute cos-sim = <W_L, W_{L+1}> / (||W_L|| ||W_{L+1}||).
|
|
3. Compute residual Δ = W_{L+1,norm} - W_{L,norm}; report ||Δ||_F
|
|
(null-if-orthogonal = sqrt(2) ≈ 1.414; null-if-identical = 0).
|
|
4. Report effective rank of Δ (via entropy of normalized spectrum).
|
|
|
|
Whole network, not just middle block. Plots cos-sim and residual-rank
|
|
trajectories across depth.
|
|
"""
|
|
import argparse
|
|
import json
|
|
import numpy as np
|
|
import torch
|
|
from transformers import AutoModelForCausalLM
|
|
|
|
|
|
def spec_entropy(singvals, eps=1e-12):
    """Return the Shannon entropy (in nats) of a squared-singular-value spectrum.

    The squared singular values are normalized to sum to one and treated as a
    probability distribution.

    Args:
        singvals: NumPy array of singular values.
        eps: Floor applied to the normalizing total and to each probability
            before the logarithm is taken, so zero entries do not produce
            ``log(0)``.

    Returns:
        ``-sum(p * log(p))`` as a Python float.
    """
    energy = singvals * singvals
    total = max(energy.sum(), eps)
    probs = np.clip(energy / total, eps, 1.0)
    terms = probs * np.log(probs)
    return float(-terms.sum())
|
|
|
|
|
|
def frob(x):
    """Return the norm of *x* as a Python float.

    Uses ``np.linalg.norm`` with its default arguments, i.e. the Frobenius
    norm for a matrix and the Euclidean norm for a vector.
    """
    magnitude = np.linalg.norm(x)
    return float(magnitude)
|
|
|
|
|
|
def norm_mat(x, eps=1e-12):
    """Scale *x* so that its Frobenius norm is one.

    The divisor is floored at *eps*, so an all-zero input comes back as zeros
    instead of triggering a division by zero.
    """
    denom = frob(x)
    if denom < eps:
        denom = eps
    return x / denom
|
|
|
|
|
|
def null_test_pair(A_dict, B_dict, family_names, num_heads, num_kv_heads, head_dim):
    """Compare two layers' raw weights, one parameter family at a time.

    Args:
        A_dict: Mapping from family name to the first layer's weight array.
        B_dict: Mapping from family name to the second layer's weight array.
        family_names: Families to examine, in output order.
        num_heads: Accepted for interface compatibility; not used here.
        num_kv_heads: Accepted for interface compatibility; not used here.
        head_dim: Accepted for interface compatibility; not used here.

    Returns:
        A dict keyed by family name. Each value holds ``cos`` (cosine
        similarity of the two arrays), ``resid_norm`` (norm of the difference
        of the unit-normalized arrays), and ``resid_eff_rank`` /
        ``resid_spec_entropy``, which are currently always ``None``.
        Families that are missing from either layer, differ in shape, or have
        a near-zero norm on either side are omitted.
    """
    stats = {}
    for name in family_names:
        if not (name in A_dict and name in B_dict):
            continue
        left, right = A_dict[name], B_dict[name]
        if left.shape != right.shape:
            continue

        left_norm, right_norm = frob(left), frob(right)
        if left_norm < 1e-12 or right_norm < 1e-12:
            # A (near-)zero matrix has no direction to compare.
            continue

        inner = (left * right).sum()
        cosine = float(inner / (left_norm * right_norm))

        # For unit-norm matrices, ||A/||A|| - B/||B|| ||^2 = 2 - 2*cos.
        # Clamp at zero so rounding cannot put a negative under the sqrt.
        gap_sq = 2.0 - 2.0 * cosine
        gap = float(np.sqrt(max(gap_sq, 0.0)))

        # The residual SVD was a bottleneck on large matrices and is skipped;
        # the rank and entropy fields stay in the schema as None.
        stats[name] = {
            "cos": cosine,
            "resid_norm": gap,
            "resid_eff_rank": None,
            "resid_spec_entropy": None,
        }
    return stats
|
|
|
|
|
|
def _weight_to_numpy(param):
    """Detach *param*, convert it with ``.float()`` and return a NumPy array."""
    return param.detach().float().numpy()


@torch.no_grad()
def main():
    """Run the adjacent-layer null test on a causal LM and report the results.

    Command-line flags:
        --model: Model identifier passed to ``from_pretrained``
            (default ``Qwen/Qwen3-4B``).
        --out: Path of the JSON file to write
            (default ``/tmp/sa-null-residual.json``).

    Steps: load the model on CPU, copy each decoder layer's weights into NumPy
    arrays, compute per-family Frobenius norms and adjacent-layer statistics,
    print the cos-sim table, a per-family summary, the scale track and a PCA
    of the log scale track, then dump everything to JSON.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="Qwen/Qwen3-4B")
    ap.add_argument("--out", default="/tmp/sa-null-residual.json")
    args = ap.parse_args()

    print(f"Loading {args.model} ...", flush=True)
    # NOTE(review): trust_remote_code=True lets the checkpoint's repository run
    # its own Python code at load time; only point --model at trusted sources.
    model = AutoModelForCausalLM.from_pretrained(
        args.model,
        torch_dtype=torch.bfloat16,  # halve memory vs fp32
        device_map="cpu",
        trust_remote_code=True,
        attn_implementation="eager",
    )
    cfg = model.config
    num_layers = cfg.num_hidden_layers
    num_heads = cfg.num_attention_heads
    num_kv_heads = getattr(cfg, "num_key_value_heads", num_heads)
    hidden = cfg.hidden_size
    head_dim = getattr(cfg, "head_dim", hidden // num_heads)
    intermediate = cfg.intermediate_size
    print(f" L={num_layers} H={num_heads} kv={num_kv_heads} hd={head_dim} "
          f"hidden={hidden} ff={intermediate}", flush=True)

    families = ["q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj",
                "input_ln", "post_attn_ln", "q_norm", "k_norm"]

    # Pull every layer's weights out of the model as NumPy arrays.
    layers = {}
    for L in range(num_layers):
        layer = model.model.layers[L]
        attn = layer.self_attn
        mlp = layer.mlp
        entry = {
            "q_proj": _weight_to_numpy(attn.q_proj.weight),
            "k_proj": _weight_to_numpy(attn.k_proj.weight),
            "v_proj": _weight_to_numpy(attn.v_proj.weight),
            "o_proj": _weight_to_numpy(attn.o_proj.weight),
            "gate_proj": _weight_to_numpy(mlp.gate_proj.weight),
            "up_proj": _weight_to_numpy(mlp.up_proj.weight),
            "down_proj": _weight_to_numpy(mlp.down_proj.weight),
            "input_ln": _weight_to_numpy(layer.input_layernorm.weight),
            "post_attn_ln": _weight_to_numpy(layer.post_attention_layernorm.weight),
        }
        # q_norm / k_norm are optional: only recorded when the attention
        # module actually has them.
        for key in ("q_norm", "k_norm"):
            norm_module = getattr(attn, key, None)
            if norm_module is not None:
                entry[key] = _weight_to_numpy(norm_module.weight)
        layers[L] = entry

    # The extracted arrays are all that is needed from here on.
    del model

    # Also record per-layer scale (Frobenius norm) for the scale-track PCA.
    # A family missing from a layer is recorded as None.
    scales = {
        fam: [frob(layers[L][fam]) if fam in layers[L] else None
              for L in range(num_layers)]
        for fam in families
    }

    # Pairwise null test
    pair_results = []
    for L in range(num_layers - 1):
        r = null_test_pair(layers[L], layers[L + 1], families,
                           num_heads, num_kv_heads, head_dim)
        pair_results.append({"L": L, "L_next": L + 1, "families": r})

    # Report
    print("\n=== Adjacent-layer raw cos-sim per family ===")
    print(" null interpretation: 1.0 = identical matrices up to scale, 0 = orthogonal")
    # Decide the column set once so the header and every row agree; a family
    # with no results at all gets neither a header cell nor row padding.
    cos_columns = [fam for fam in families
                   if any(fam in pr["families"] for pr in pair_results)]
    print(f"\n {'L':>3}", end="")
    for fam in cos_columns:
        print(f" {fam:>12}", end="")
    print()
    for pr in pair_results:
        print(f" {pr['L']:>3}", end="")
        for fam in cos_columns:
            if fam in pr["families"]:
                print(f" {pr['families'][fam]['cos']:>+12.4f}", end="")
            else:
                print(f" {'':>12}", end="")
        print()

    # Summary per family + scalar-T fit comparison
    # raw_resid = sqrt(2 - 2*cos); scalar_fit = sqrt(1 - cos²) = sin(angle).
    # random_baseline = sqrt(2) ≈ 1.414.
    print("\n=== Per-family summary (across all adjacent pairs) ===")
    print(" random baseline = sqrt(2) ≈ 1.414 (what we'd see with no relationship)")
    print(f"\n {'family':>14} {'mean_cos':>10} {'median_cos':>11} "
          f"{'raw_resid':>10} {'scalar_fit':>11} {'improve_frac':>13} {'mean_SE':>8}")
    for fam in families:
        per_pair = [pr["families"][fam] for pr in pair_results
                    if fam in pr["families"]]
        cs = [st["cos"] for st in per_pair if st.get("cos") is not None]
        ses = [st["resid_spec_entropy"] for st in per_pair
               if st.get("resid_spec_entropy") is not None]
        if not cs:
            continue
        cos_arr = np.array(cs)
        raw = np.sqrt(np.maximum(2.0 - 2.0 * cos_arr, 0.0)).mean()
        scalar_fit = np.sqrt(np.maximum(1.0 - cos_arr ** 2, 0.0)).mean()
        # Improvement fraction: (raw - scalar_fit) / (raw - 0) normalized
        # to [0, 1] where 0 = scalar does nothing, 1 = scalar reconstructs.
        improve_frac = (raw - scalar_fit) / max(raw, 1e-12)
        # mean_SE falls back to 0 when no spectral entropies were computed.
        print(f" {fam:>14} {np.mean(cs):>+10.4f} {np.median(cs):>+11.4f} "
              f"{raw:>10.4f} {scalar_fit:>11.4f} {improve_frac:>13.4f} "
              f"{np.mean(ses) if ses else 0:>8.4f}")

    # Scale-track: Frobenius norm of each family across layers
    print("\n=== Scale track: ||W_family||_F across layers ===")
    # Same single-column-set rule as the cos-sim table above.
    scale_columns = [fam for fam in families
                     if any(s is not None for s in scales[fam])]
    print(f" {'L':>3}", end="")
    for fam in scale_columns:
        print(f" {fam:>12}", end="")
    print()
    for L in range(num_layers):
        print(f" {L:>3}", end="")
        for fam in scale_columns:
            if scales[fam][L] is not None:
                print(f" {scales[fam][L]:>12.4f}", end="")
            else:
                print(f" {'':>12}", end="")
        print()

    # PCA of log-scale-track to see dimensionality of schedule
    print("\n=== PCA of log-scale-track (dimensionality of schedule) ===")
    # Only families with a strictly positive norm in every layer can enter
    # the PCA: a missing value has no log and a zero norm gives -inf.
    fam_used = [
        fam for fam in families
        if scales[fam] and all(v is not None and v > 0.0 for v in scales[fam])
    ]
    if fam_used:
        scale_matrix = np.log(np.array([scales[fam] for fam in fam_used]))  # (num_families, L)
        # Center per-family
        sm_c = scale_matrix - scale_matrix.mean(axis=1, keepdims=True)
        # SVD: columns are layers, rows are families
        U, S, Vh = np.linalg.svd(sm_c, full_matrices=False)
        total = (S ** 2).sum()
        print(" explained variance by mode:")
        for i, s in enumerate(S):
            pct = float(s ** 2 / max(total, 1e-20)) * 100
            loadings = ", ".join(
                f"{fam}={U[j, i]:+.2f}" for j, fam in enumerate(fam_used)
            )
            print(f" mode {i+1:>2}: {pct:>6.2f}% "
                  f"(loadings per family: {loadings})")
    else:
        # Nothing to decompose; still emit empty, well-formed PCA fields.
        print(" skipped: no family has a positive norm in every layer")
        U = np.zeros((0, 0))
        S = np.zeros(0)
        Vh = np.zeros((0, num_layers))

    # Save
    with open(args.out, "w", encoding="utf-8") as f:
        json.dump({
            "model": args.model,
            "pair_results": pair_results,
            "scales": scales,
            "scale_pca_singvals": S.tolist(),
            "scale_pca_loadings": U.tolist(),
            "scale_pca_scores": (np.diag(S) @ Vh).tolist(),
            "fam_used": fam_used,
        }, f, indent=2)
    print(f"\nSaved: {args.out}")
|
|
|
|
|
|
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|