amygdala: report eigenvalue spectrum for subspace method

When --method subspace is selected, record the top-20 eigenvalues of
(M_pos - M_base) per concept per layer. They are added to quality.json
as 'subspace_eigvals'.

This tells us whether the concept lives in a single dominant direction
(λ_0 >> λ_1, so the top eigenvector is enough) or is spread across
several shared directions (λ_0 ≈ λ_1, so top-1 loses signal).

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-18 21:33:48 -04:00
parent fe0fb8253a
commit 974c6c7fd2

View file

@ -1001,6 +1001,9 @@ def main() -> None:
# and use sum-of-projection-operators per concept. -------------------- # and use sum-of-projection-operators per concept. --------------------
pos_subspaces: list[dict[int, torch.Tensor]] | None = None pos_subspaces: list[dict[int, torch.Tensor]] | None = None
base_subspaces: list[dict[int, torch.Tensor]] | None = None base_subspaces: list[dict[int, torch.Tensor]] | None = None
# Per (concept, layer): top-20 eigenvalues of (M_pos - M_base), descending.
# Populated only when --method subspace.
subspace_eigvals: dict[str, dict[int, list[float]]] = {}
if args.method == "subspace": if args.method == "subspace":
print("\nCollecting per-story subspaces (SVD, top-k right singular " print("\nCollecting per-story subspaces (SVD, top-k right singular "
f"vectors, k={args.subspace_k})...") f"vectors, k={args.subspace_k})...")
@ -1035,10 +1038,14 @@ def main() -> None:
pos_V = [pos_subspaces[j][target_l] for j in pos_rows] pos_V = [pos_subspaces[j][target_l] for j in pos_rows]
base_V = [pos_subspaces[j][target_l] for j in neg_rows] base_V = [pos_subspaces[j][target_l] for j in neg_rows]
base_V += [bs[target_l] for bs in (base_subspaces or [])] base_V += [bs[target_l] for bs in (base_subspaces or [])]
top_vec, _eigvals = _subspace_concept_direction( top_vec, eigvals = _subspace_concept_direction(
pos_V, base_V, hidden=hidden_dim, pos_V, base_V, hidden=hidden_dim,
) )
per_layer_vectors[l_idx, e_idx] = top_vec per_layer_vectors[l_idx, e_idx] = top_vec
# Keep the top-20 eigenvalues for quality-report diagnostics.
subspace_eigvals.setdefault(emotion, {})[target_l] = (
eigvals[-20:].flip(0).tolist()
)
else: else:
pos = positive_acts[pos_rows] # [n_pos, n_layers, hidden] pos = positive_acts[pos_rows] # [n_pos, n_layers, hidden]
neg = positive_acts[neg_rows] # [n_neg, n_layers, hidden] neg = positive_acts[neg_rows] # [n_neg, n_layers, hidden]
@ -1149,6 +1156,17 @@ def main() -> None:
"per-head analysis skipped." "per-head analysis skipped."
) )
# Eigenvalue spectrum from the subspace method — if populated, report
# the top-20 eigenvalues per concept per layer. Tells us whether the
# concept direction lives in a single dominant dimension (λ_0 >> λ_1)
# or a spread of common directions (λ_0 ≈ λ_1 ≈ ...).
if subspace_eigvals:
for emotion, per_l in subspace_eigvals.items():
if emotion in report:
report[emotion]["subspace_eigvals"] = {
str(l): vals for l, vals in per_l.items()
}
# Linear combinations — for each concept, how much of its direction # Linear combinations — for each concept, how much of its direction
# is explained by a ridge regression on the others. R² > 0.9 flags # is explained by a ridge regression on the others. R² > 0.9 flags
# concepts that are essentially linear combinations of their peers # concepts that are essentially linear combinations of their peers