forked from kent/consciousness
replace try_lock() with lock_blocking() across UI thread
Add lock_blocking() to TrackedMutex: blocks current thread using block_in_place + futures::executor::block_on, safe for sync contexts. Replace all try_lock() calls with lock_blocking() in slash commands, UI rendering, and status reads. Lock hold times are fast enough that blocking briefly is fine, and this eliminates the spurious 'lock unavailable' paths that were never actually hit. Kept rx_mutex.try_lock() in mod.rs (std::sync::Mutex for stderr rx).
This commit is contained in:
parent
5210f7dd66
commit
4225294d16
28 changed files with 4199 additions and 67 deletions
114
sa-schedule-geometry-analyze.py
Normal file
114
sa-schedule-geometry-analyze.py
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
"""What does per-head T (entropy) correlate with geometrically?
|
||||
|
||||
For each (layer, head) we already have singular values of the metric M^h = W_K^h^T W_Q^h
|
||||
(up to the low-rank structure — strictly SVD of the head_dim x head_dim product). Derive
|
||||
richer per-head geometric descriptors and test which ones predict dynamic entropy.
|
||||
|
||||
Descriptors per head:
|
||||
op_norm σ_max — global "capacity for sharpness"
|
||||
fro_norm √Σ σ_i² — total metric "energy"
|
||||
rank_eff Σσ / σ_max — effective number of modes
|
||||
spec_entropy -Σ (σ_i² / Σσ_j²) log(...) — flatness of spectrum (nats)
|
||||
anisotropy σ_max / σ_mean — how "peaked" the top mode is
|
||||
condition σ_max / σ_min — ratio of biggest to smallest
|
||||
trace Σσ_i — sum of modes (L1-like)
|
||||
|
||||
Correlate each of these per-head descriptors against per-head dynamic entropy, across
|
||||
all (layer, head) pairs. Also stratified by layer-position (early/mid/late).
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import numpy as np
|
||||
|
||||
|
||||
def compute_per_head_geometry(singvals_list):
|
||||
"""singvals_list: list per head of list of singular values. Returns dict of arrays."""
|
||||
s_all = [np.array(s, dtype=np.float64) for s in singvals_list]
|
||||
op = np.array([s.max() for s in s_all])
|
||||
fro = np.array([np.sqrt((s ** 2).sum()) for s in s_all])
|
||||
trace = np.array([s.sum() for s in s_all])
|
||||
rank_eff = np.array([s.sum() / max(s.max(), 1e-12) for s in s_all])
|
||||
# Spectral entropy: use normalized σ² as probabilities
|
||||
spec_ent = np.zeros(len(s_all))
|
||||
for i, s in enumerate(s_all):
|
||||
p = (s ** 2) / max((s ** 2).sum(), 1e-12)
|
||||
p = np.clip(p, 1e-12, 1.0)
|
||||
spec_ent[i] = float(-(p * np.log(p)).sum())
|
||||
anis = np.array([s.max() / max(s.mean(), 1e-12) for s in s_all])
|
||||
cond = np.array([s.max() / max(s.min(), 1e-12) for s in s_all])
|
||||
return dict(op=op, fro=fro, trace=trace, rank_eff=rank_eff,
|
||||
spec_ent=spec_ent, anisotropy=anis, condition=cond)
|
||||
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("input_json")
|
||||
args = ap.parse_args()
|
||||
|
||||
with open(args.input_json) as f:
|
||||
data = json.load(f)
|
||||
|
||||
num_layers = data["num_layers"]
|
||||
num_heads = data["num_heads"]
|
||||
|
||||
# Entropy per (layer, head)
|
||||
ent = np.array([row["mean_attention_entropy_per_head"] for row in data["dynamic"]]) # (L, H)
|
||||
logit_std = np.array([row["mean_logit_std_per_head"] for row in data["dynamic"]]) # (L, H)
|
||||
|
||||
# Geometric descriptors per (layer, head)
|
||||
geom = {k: np.zeros((num_layers, num_heads)) for k in
|
||||
["op", "fro", "trace", "rank_eff", "spec_ent", "anisotropy", "condition"]}
|
||||
for L, row in enumerate(data["static"]):
|
||||
per_head = compute_per_head_geometry(row["metric_singvals_per_head"])
|
||||
for k, v in per_head.items():
|
||||
geom[k][L] = v
|
||||
|
||||
# Flatten across (layer, head) and correlate
|
||||
print("All (layer, head) pairs — Pearson correlation with dynamic entropy:")
|
||||
ent_flat = ent.flatten()
|
||||
logit_flat = logit_std.flatten()
|
||||
results = {}
|
||||
for k, v in geom.items():
|
||||
v_flat = v.flatten()
|
||||
c_ent = float(np.corrcoef(v_flat, ent_flat)[0, 1])
|
||||
c_logit = float(np.corrcoef(v_flat, logit_flat)[0, 1])
|
||||
results[k] = (c_ent, c_logit)
|
||||
print(f" {k:12} vs entropy: {c_ent:+.3f} vs logit_std: {c_logit:+.3f}")
|
||||
|
||||
# Stratify by layer position — early (0-11), mid (12-23), late (24-35)
|
||||
thirds = [(0, num_layers // 3, "early"),
|
||||
(num_layers // 3, 2 * num_layers // 3, "mid"),
|
||||
(2 * num_layers // 3, num_layers, "late")]
|
||||
print("\nStratified by layer position (entropy correlation):")
|
||||
for lo, hi, name in thirds:
|
||||
print(f" [{name} L{lo}-{hi-1}]", end="")
|
||||
for k in ["op", "fro", "rank_eff", "spec_ent", "anisotropy", "condition"]:
|
||||
c = float(np.corrcoef(geom[k][lo:hi].flatten(), ent[lo:hi].flatten())[0, 1])
|
||||
print(f" {k}:{c:+.2f}", end="")
|
||||
print()
|
||||
|
||||
# Best single predictor across all
|
||||
print("\nBest single geometric predictor of entropy (abs):")
|
||||
best = max(results.items(), key=lambda kv: abs(kv[1][0]))
|
||||
print(f" {best[0]} r = {best[1][0]:+.3f}")
|
||||
|
||||
# Multi-regression: try op, spec_ent, rank_eff jointly
|
||||
print("\nLinear regression of entropy on multiple descriptors (standardized):")
|
||||
from numpy.linalg import lstsq
|
||||
X_cols = ["op", "spec_ent", "rank_eff", "anisotropy"]
|
||||
X = np.stack([geom[k].flatten() for k in X_cols], axis=1)
|
||||
# standardize
|
||||
X = (X - X.mean(axis=0)) / (X.std(axis=0) + 1e-12)
|
||||
y = (ent_flat - ent_flat.mean()) / (ent_flat.std() + 1e-12)
|
||||
X1 = np.concatenate([X, np.ones((X.shape[0], 1))], axis=1)
|
||||
coef, res, rk, sv = lstsq(X1, y, rcond=None)
|
||||
y_pred = X1 @ coef
|
||||
r2 = 1 - float(((y - y_pred) ** 2).sum() / ((y - y.mean()) ** 2).sum())
|
||||
print(f" R² = {r2:.3f}")
|
||||
print(f" standardized coefficients:")
|
||||
for name, c in zip(X_cols, coef[:-1]):
|
||||
print(f" {name:12} {c:+.3f}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue