forked from kent/consciousness
replace try_lock() with lock_blocking() across UI thread
Add lock_blocking() to TrackedMutex: blocks current thread using block_in_place + futures::executor::block_on, safe for sync contexts. Replace all try_lock() calls with lock_blocking() in slash commands, UI rendering, and status reads. Lock hold times are fast enough that blocking briefly is fine, and this eliminates the spurious 'lock unavailable' paths that were never actually hit. Kept rx_mutex.try_lock() in mod.rs (std::sync::Mutex for stderr rx).
This commit is contained in:
parent
5210f7dd66
commit
4225294d16
28 changed files with 4199 additions and 67 deletions
145
sa-schedule-fit-gamma.py
Normal file
145
sa-schedule-fit-gamma.py
Normal file
|
|
@ -0,0 +1,145 @@
|
|||
"""Fit a functional form to the LN γ trajectory across layers; derive the
|
||||
effective attention temperature T(L) from known coupling formulas.
|
||||
|
||||
Rules of what scales with depth (from literature):
|
||||
DeepNorm: α_dec = (2M)^(1/4), β_dec = (8M)^(-1/4). Same per layer — does
|
||||
NOT depend on layer index l. The free variation across layers has to
|
||||
live in LN γ.
|
||||
Depth-μP: block multiplier a/√L, LR η/√L. Same per layer.
|
||||
So γ(L) is the family carrying the per-layer schedule.
|
||||
|
||||
Try fitting forms:
|
||||
γ(L) = a · L^b (power law in layer index)
|
||||
γ(L) = a · exp(b·L) (exponential)
|
||||
γ(L) = a + b·L (linear)
|
||||
γ(L) = a + b·L^c (free c) (power law with free exponent; listed as a candidate, but no fit for this form is implemented in this script)
|
||||
|
||||
Report fit quality (R², residual statistics), and for the best fit, compute
|
||||
the derived T(L) curve.
|
||||
"""
|
||||
import json
|
||||
import numpy as np
|
||||
from math import log, exp
|
||||
|
||||
|
||||
def fit_power(L, y):
    """Fit the power law ``y ≈ a · L^b`` via a straight line in log-log space.

    Only points with ``L > 0`` and ``y > 0`` take part in the regression,
    because the logarithm is undefined elsewhere. The prediction ``yhat``
    and the R² score are then evaluated on *all* points, in the original
    (non-log) space.

    ``L`` and ``y`` are indexed with a boolean mask, so they are expected
    to be NumPy arrays of equal length.

    Returns a dict with keys ``form``, ``a``, ``b``, ``r2`` and ``yhat``.
    """
    usable = (L > 0) & (y > 0)
    log_x = np.log(L[usable])
    log_y = np.log(y[usable])

    # polyfit returns the highest-degree coefficient first: slope, intercept.
    slope, intercept = np.polyfit(log_x, log_y, 1)
    amplitude = np.exp(intercept)
    predicted = amplitude * (L**slope)

    ss_res = ((y - predicted)**2).sum()
    ss_tot = ((y - y.mean())**2).sum()
    return {
        "form": "a*L^b",
        "a": float(amplitude),
        "b": float(slope),
        "r2": float(1 - ss_res / ss_tot),
        "yhat": predicted,
    }
|
||||
|
||||
|
||||
def fit_exponential(L, y):
    """Fit ``y ≈ a · exp(b·L)`` via a straight line in semi-log space.

    The regression ``log y ≈ log a + b·L`` uses only the points with
    ``y > 0``. The prediction ``yhat`` and the R² score are evaluated on
    every point, in the original (non-log) space.

    Returns a dict with keys ``form``, ``a``, ``b``, ``r2`` and ``yhat``.
    """
    positive = y > 0

    # polyfit returns the highest-degree coefficient first: rate, log-amplitude.
    rate, log_amp = np.polyfit(L[positive], np.log(y[positive]), 1)
    amplitude = np.exp(log_amp)
    predicted = amplitude * np.exp(rate * L)

    ss_res = ((y - predicted)**2).sum()
    ss_tot = ((y - y.mean())**2).sum()
    return {
        "form": "a*exp(b*L)",
        "a": float(amplitude),
        "b": float(rate),
        "r2": float(1 - ss_res / ss_tot),
        "yhat": predicted,
    }
|
||||
|
||||
|
||||
def fit_linear(L, y):
    """Ordinary least-squares fit of the straight line ``y ≈ a + b·L``.

    Returns a dict with keys ``form``, ``a`` (intercept), ``b`` (slope),
    ``r2`` (coefficient of determination) and ``yhat`` (fitted values).
    """
    # polyfit returns the highest-degree coefficient first: slope, intercept.
    slope, intercept = np.polyfit(L, y, 1)
    predicted = intercept + slope * L

    ss_res = ((y - predicted)**2).sum()
    ss_tot = ((y - y.mean())**2).sum()
    return {
        "form": "a+b*L",
        "a": float(intercept),
        "b": float(slope),
        "r2": float(1 - ss_res / ss_tot),
        "yhat": predicted,
    }
|
||||
|
||||
|
||||
def fit_piecewise_two(L, y):
    """Find the best two-segment exponential fit of ``y`` against ``L``.

    Each candidate threshold ``Ls`` in ``range(3, len(L) - 3)`` splits the
    data into the segments ``L < Ls`` and ``L >= Ls``. On each segment a
    straight line is fitted to ``log y`` (i.e. ``y ≈ a·exp(b·L)``), and the
    split with the highest R², measured in the original (non-log) space
    over all points, is kept.

    Note that the thresholds come from an index range but are compared
    against the *values* in ``L``. A candidate is skipped when

    * either segment contains a non-positive ``y`` (log undefined), or
    * either segment has fewer than two points, since a line cannot be
      fitted there and ``np.polyfit`` would raise on an empty segment.

    ``L`` and ``y`` are expected to be NumPy arrays of equal length.

    Returns:
        A dict with keys ``form``, ``split``, ``a1``, ``b1``, ``a2``,
        ``b2``, ``r2`` and ``yhat`` for the best split, or ``None`` when no
        candidate split is usable.
    """
    best = None
    # Total sum of squares does not depend on the split; compute it once.
    ss_tot = ((y - y.mean())**2).sum()

    for Ls in range(3, len(L) - 3):
        mA, mB = L < Ls, L >= Ls

        # A degree-1 fit needs at least two points on each side.
        if np.count_nonzero(mA) < 2 or np.count_nonzero(mB) < 2:
            continue
        # log() below requires strictly positive targets in both segments.
        if (y[mA] <= 0).any() or (y[mB] <= 0).any():
            continue

        bA, aA = np.polyfit(L[mA], np.log(y[mA]), 1)
        bB, aB = np.polyfit(L[mB], np.log(y[mB]), 1)
        yhat = np.where(mA, np.exp(aA + bA * L), np.exp(aB + bB * L))
        r2 = 1 - ((y - yhat)**2).sum() / ss_tot

        if best is None or r2 > best["r2"]:
            best = {
                "form": f"piecewise-exp-split@L={Ls}",
                "split": int(Ls),
                "a1": float(np.exp(aA)), "b1": float(bA),
                "a2": float(np.exp(aB)), "b2": float(bB),
                "r2": float(r2),
                "yhat": yhat,
            }
    return best
|
||||
|
||||
|
||||
def main(path: str = "/tmp/qwen3-4b-null.json") -> None:
    """Print γ-trajectory fits and the derived attention-temperature curve.

    Reads a JSON file whose top-level ``"scales"`` object maps a family name
    (e.g. ``"input_ln"``) to a list of per-layer magnitudes, then:

    1. fits linear, power-law, exponential and two-segment exponential
       forms to every family of interest and prints their R²;
    2. tabulates the ``input_ln`` magnitudes per layer;
    3. prints the relative temperature ``T(L)/T(0) = γ(0)/γ(L)`` and a
       log-space linear (Kirkpatrick-style exponential cooling) fit to it.

    Args:
        path: Location of the JSON scales file. Defaults to the path the
            script was originally hard-wired to.

    Raises:
        OSError: if ``path`` cannot be opened.
        KeyError: if ``"scales"`` or any family of interest is missing.
    """
    # Context manager so the file handle is closed deterministically.
    with open(path, encoding="utf-8") as fh:
        d = json.load(fh)
    scales = d["scales"]
    num_layers = len(scales["input_ln"])
    L = np.arange(num_layers, dtype=float)

    families_of_interest = ["input_ln", "post_attn_ln", "q_norm", "k_norm",
                            "q_proj", "k_proj", "v_proj", "o_proj",
                            "gate_proj", "up_proj", "down_proj"]

    print("=" * 72)
    # Layer count is taken from the data rather than hard-coded.
    print(f"γ-trajectory fits per family (Qwen3-4B, {num_layers} layers)")
    print("=" * 72)

    for fam in families_of_interest:
        y = np.array(scales[fam], dtype=float)
        print(f"\n--- {fam} ---")
        # The last-layer label follows the actual array length.
        print(f" L=0: {y[0]:.3f} L={len(y) - 1}: {y[-1]:.3f} ratio: {y[-1]/y[0]:+.2f}×")
        fits = [
            fit_linear(L, y),
            fit_power(L + 1, y),  # L+1 so L=0 doesn't explode log
            fit_exponential(L, y),
            fit_piecewise_two(L + 1, y),  # split is reported in L+1 units
        ]
        for f in fits:
            if f is None:  # fit_piecewise_two found no usable split
                continue
            extras = ""
            if "b" in f:
                extras = f" (a={f['a']:.3g}, b={f['b']:+.4f})"
            elif "split" in f:
                extras = f" (split={f['split']}, b1={f['b1']:+.4f}, b2={f['b2']:+.4f})"
            print(f" {f['form']:<32} R²={f['r2']:+.4f}{extras}")

    # For input_ln specifically: print the curve (text) and derive T(L)
    y = np.array(scales["input_ln"], dtype=float)
    print("\n" + "=" * 72)
    print("input_ln γ magnitude across layers (the schedule signal)")
    print("=" * 72)
    print(f" {'L':>3} {'γ_L':>12} {'γ_L / γ_0':>10} {'log γ_L':>10}")
    for l_idx, gamma in enumerate(y):
        print(f" {l_idx:>3} {gamma:>12.3f} {gamma/y[0]:>10.3f} {log(gamma):>+10.4f}")

    # Classical SA schedules for comparison
    #  - Linear:                    T(k) = T0 - k * (T0 - Tf)/N
    #  - Exponential / Kirkpatrick: T(k) = T0 * α^k
    #  - Logarithmic / Hajek:       T(k) = c / log(k+2)
    # For γ (which grows = temperature drops, since larger γ → sharper attention):
    #   γ growing corresponds to T cooling
    print("\n" + "=" * 72)
    print("Derived attention-temperature T(L) interpretation")
    print("=" * 72)
    print(" Attention logit ∝ (γ * W_Q * W_K * ||residual||²) / √d_head.")
    print(" With γ_L the schedule dial and other factors ~constant across layers,")
    print(" effective attention temperature T(L) ∝ 1/γ(L).")
    print("\n T(L)/T(0) = γ(0)/γ(L):")
    print(f" {'L':>3} {'T(L)/T(0)':>10} (smaller = cooler = sharper attention)")
    for l_idx, gamma in enumerate(y):
        print(f" {l_idx:>3} {y[0]/gamma:>10.4f}")

    # Comparison with classical SA cooling laws:
    # Kirkpatrick: T(L) = T0 · α^L → log T(L) = log T0 + L log α
    logT = -np.log(y / y[0])  # because T ∝ 1/γ
    b_kirk, a_kirk = np.polyfit(L, logT, 1)
    # Hajek (log-cooling): T(L) = c/log(L+2)
    #   Predicts: log T = log c - log(log(L+2))
    #   A fit of T(L) to c / log(L+c2) is not implemented here.
    print("\n Kirkpatrick-law fit (exponential cooling):")
    print(f" log T(L) = {a_kirk:+.3f} + {b_kirk:+.4f} * L → T(L) = exp({a_kirk:+.3f}) · exp({b_kirk:+.4f}·L)")
    logT_hat = a_kirk + b_kirk * L
    r2_kirk = 1 - ((logT - logT_hat)**2).sum() / ((logT - logT.mean())**2).sum()
    print(f" R² (in log space) = {r2_kirk:+.4f} — ideally ≈ 1 if cooling is pure exponential")


# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue