forked from kent/consciousness
replace try_lock() with lock_blocking() across UI thread
Add lock_blocking() to TrackedMutex: blocks current thread using block_in_place + futures::executor::block_on, safe for sync contexts. Replace all try_lock() calls with lock_blocking() in slash commands, UI rendering, and status reads. Lock hold times are fast enough that blocking briefly is fine, and this eliminates the spurious 'lock unavailable' paths that were never actually hit. Kept rx_mutex.try_lock() in mod.rs (std::sync::Mutex for stderr rx).
This commit is contained in:
parent
5210f7dd66
commit
4225294d16
28 changed files with 4199 additions and 67 deletions
122
sa-schedule-gamma-directions.py
Normal file
122
sa-schedule-gamma-directions.py
Normal file
|
|
@ -0,0 +1,122 @@
|
|||
"""Pull input_layernorm.γ vectors from a model and analyze direction
structure across layers.

Question: is γ just scalar magnitude (isotropic SA) or does each layer
have a preferred direction (anisotropic SA / geometry-aware)?

Decomposition: γ_L = ||γ_L|| · γ_L̂
  - ||γ_L|| is what our scalar Kirkpatrick fit captured
  - γ_L̂ is the unit direction — if layers share a direction, γ is rank-1 +
    scaling (classical isotropic). If directions differ per layer, γ
    encodes a per-layer preferred axis (anisotropic).

We also look at:
  - pairwise cos-sim between γ_L̂ across layers
  - principal components of [γ_L̂]_L (stacked matrix)
  - per-phase structure: is Phase E more anisotropic than Phase C?
"""
|
||||
import argparse
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
|
||||
# (inclusive upper layer index, phase label), in ascending order.
# These boundaries were chosen for Qwen3-32B; layers past the last bound are
# labelled "tail".
_PHASE_UPPER_BOUNDS = ((6, "A"), (9, "B"), (31, "C"), (46, "D"), (58, "E"))


def _phase(layer):
    """Return the phase label for a layer index (Qwen3-32B boundaries)."""
    for upper, label in _PHASE_UPPER_BOUNDS:
        if layer <= upper:
            return label
    return "tail"


def _load_input_layernorm_gammas(model_name):
    """Load *model_name* on CPU and return its stacked input_layernorm weights.

    Returns:
        (gammas, num_layers, hidden) where ``gammas`` is a float32 array with
        one row per decoder layer.

    The model object is local to this function, so it becomes collectable as
    soon as the weights have been copied out.
    """
    # NOTE(review): trust_remote_code=True lets the model repository execute
    # its own Python code; only point --model at repositories you trust.
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float32, device_map="cpu",
        trust_remote_code=True,
    )
    num_layers = model.config.num_hidden_layers
    hidden = model.config.hidden_size
    gammas = np.stack([
        model.model.layers[i].input_layernorm.weight.detach().float().cpu().numpy()
        for i in range(num_layers)
    ])  # (L, hidden)
    return gammas, num_layers, hidden


def _phase_cosine_stats(units, layers):
    """Return (intra_mean, vs_other_mean) cosine similarity for one phase.

    ``intra_mean`` averages cos-sim over distinct layer pairs inside the phase
    (1.0 when the phase has a single layer); ``vs_other_mean`` averages
    cos-sim between phase layers and all remaining layers (0.0 when there are
    no remaining layers).
    """
    # Boolean mask instead of repeated `L not in list` membership tests.
    in_phase = np.zeros(len(units), dtype=bool)
    in_phase[layers] = True

    u = units[in_phase]
    intra = (u @ u.T)[np.triu_indices(len(u), k=1)]
    intra_mean = intra.mean() if intra.size else 1.0

    u_other = units[~in_phase]
    vs_mean = (u @ u_other.T).mean() if len(u_other) else 0.0
    return intra_mean, vs_mean


def main():
    """Analyze the direction structure of per-layer input_layernorm γ vectors.

    Command-line options:
        --model  Hugging Face model id or path (default ``Qwen/Qwen3-32B``).
        --out    Path of the JSON summary to write
                 (default ``/tmp/gamma-dirs.json``).

    Prints per-layer norms, a PCA of the unit γ vectors, per-phase cosine
    statistics and adjacent-layer cosine similarities, then writes a JSON
    summary to ``--out``.

    Raises:
        ValueError: if any layer's γ vector has a zero or non-finite norm, in
            which case a unit direction is undefined.
    """
    import json

    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="Qwen/Qwen3-32B")
    ap.add_argument("--out", default="/tmp/gamma-dirs.json")
    args = ap.parse_args()

    # from_pretrained materializes the whole model, not just the layernorm
    # parameters, so the message says so.
    print(f"Loading {args.model} (CPU; full model is loaded, only "
          f"input_layernorm γ is kept)...", flush=True)
    gammas, num_layers, hidden = _load_input_layernorm_gammas(args.model)
    print(f" L={num_layers}, hidden={hidden}", flush=True)

    norms = np.linalg.norm(gammas, axis=1)
    usable = np.isfinite(norms) & (norms > 0)
    if not usable.all():
        bad_layers = np.flatnonzero(~usable).tolist()
        raise ValueError(
            f"input_layernorm γ has zero or non-finite norm at layers "
            f"{bad_layers}; unit directions are undefined"
        )
    units = gammas / norms[:, None]

    # Pairwise cos-sim of unit γ
    cos_mat = units @ units.T  # (L, L)

    # PCA on unit vectors
    centered = units - units.mean(axis=0, keepdims=True)
    _, S, Vt = np.linalg.svd(centered, full_matrices=False)
    total_var = (S**2).sum()
    # All-identical directions give zero variance; report zeros, not NaN.
    explained = S**2 / total_var if total_var > 0 else np.zeros_like(S)

    # Projection of each unit γ onto the first principal axis.
    # NOTE(review): Vt[0] comes from the *centered* matrix, so it is the axis
    # of largest variation across layers rather than the mean direction, and
    # its sign is arbitrary — confirm this is the intended "shared" direction.
    top1 = Vt[0]  # (hidden,)
    proj_top1 = units @ top1  # (L,)
    residual_after_top1 = np.sqrt(np.maximum(1 - proj_top1**2, 0))

    # Per-phase summary (Qwen3-32B boundaries)
    labels = [_phase(i) for i in range(num_layers)]
    phase_ls = {}
    for i, label in enumerate(labels):
        phase_ls.setdefault(label, []).append(i)

    print("\n=== ||γ_L|| per layer (scalar magnitude) ===")
    for i in range(num_layers):
        print(f" L={i:>2} phase={labels[i]:>5} ||γ||={norms[i]:>8.3f} "
              f"proj_top1={proj_top1[i]:>+.4f} resid={residual_after_top1[i]:>.4f}")

    print("\n=== PCA of unit γ vectors (direction structure) ===")
    print(" Explained variance, top 10 components:")
    for i in range(min(10, len(S))):
        print(f" PC{i}: {explained[i]:.4f} (singular_val={S[i]:.4f})")
    print(f" Top-3 explain: {explained[:3].sum():.4f}")
    print(f" Top-10 explain: {explained[:10].sum():.4f}")

    print("\n=== Per-phase direction statistics ===")
    print(f" {'phase':>6} {'N':>3} {'||γ||_mean':>10} {'||γ||_std':>9} "
          f"{'intra_cos':>9} {'vs_other_cos':>12}")
    for ph, layers in phase_ls.items():
        intra_mean, vs_mean = _phase_cosine_stats(units, layers)
        phase_norms = norms[layers]
        print(f" {ph:>6} {len(layers):>3} {phase_norms.mean():>10.3f} "
              f"{phase_norms.std():>9.3f} {intra_mean:>+9.4f} {vs_mean:>+12.4f}")

    print("\n=== Adjacent-pair unit-γ cos-sim ===")
    cos_adjacent = [float(cos_mat[i, i + 1]) for i in range(num_layers - 1)]
    for i, cos in enumerate(cos_adjacent):
        print(f" L={i:>2}→{i+1:>2} phase={labels[i]:>5} cos={cos:>+.4f}")

    with open(args.out, "w", encoding="utf-8") as f:
        json.dump({
            "model": args.model,
            "num_layers": num_layers,
            "norms": norms.tolist(),
            "proj_top1": proj_top1.tolist(),
            "explained_var": explained.tolist(),
            "cos_adjacent": cos_adjacent,
        }, f, indent=2)
    print(f"\nSaved: {args.out}")
|
||||
|
||||
|
||||
# Run the analysis only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue