From 974c6c7fd2a5100cb96dc9dddc3fa672be5d6ec2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 18 Apr 2026 21:33:48 -0400 Subject: [PATCH] amygdala: report eigenvalue spectrum for subspace method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When --method subspace, record top-20 eigenvalues of (M_pos - M_base) per concept per layer. Added to quality.json as 'subspace_eigvals'. Tells us whether the concept lives in a single dominant direction (λ_0 >> λ_1, top-eigenvector is enough) or a spread of shared common directions (λ_0 ≈ λ_1, top-1 loses signal). Co-Authored-By: Proof of Concept --- .../train_steering_vectors.py | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/training/amygdala_training/train_steering_vectors.py b/training/amygdala_training/train_steering_vectors.py index ba8fa5d..b44df2c 100644 --- a/training/amygdala_training/train_steering_vectors.py +++ b/training/amygdala_training/train_steering_vectors.py @@ -1001,6 +1001,9 @@ def main() -> None: # and use sum-of-projection-operators per concept. -------------------- pos_subspaces: list[dict[int, torch.Tensor]] | None = None base_subspaces: list[dict[int, torch.Tensor]] | None = None + # Per (concept, layer): top-20 eigenvalues of (M_pos - M_base), descending. + # Populated only when --method subspace. + subspace_eigvals: dict[str, dict[int, list[float]]] = {} if args.method == "subspace": print("\nCollecting per-story subspaces (SVD, top-k right singular " f"vectors, k={args.subspace_k})...") @@ -1035,10 +1038,14 @@ def main() -> None: pos_V = [pos_subspaces[j][target_l] for j in pos_rows] base_V = [pos_subspaces[j][target_l] for j in neg_rows] base_V += [bs[target_l] for bs in (base_subspaces or [])] - top_vec, _eigvals = _subspace_concept_direction( + top_vec, eigvals = _subspace_concept_direction( pos_V, base_V, hidden=hidden_dim, ) per_layer_vectors[l_idx, e_idx] = top_vec + # Keep the top-20 eigenvalues for quality-report diagnostics. + subspace_eigvals.setdefault(emotion, {})[target_l] = ( + eigvals[-20:].flip(0).tolist() + ) else: pos = positive_acts[pos_rows] # [n_pos, n_layers, hidden] neg = positive_acts[neg_rows] # [n_neg, n_layers, hidden] @@ -1149,6 +1156,17 @@ def main() -> None: "per-head analysis skipped." ) + # Eigenvalue spectrum from the subspace method — if populated, report + # the top-20 eigenvalues per concept per layer. Tells us whether the + # concept direction lives in a single dominant dimension (λ_0 >> λ_1) + # or a spread of common directions (λ_0 ≈ λ_1 ≈ ...). + if subspace_eigvals: + for emotion, per_l in subspace_eigvals.items(): + if emotion in report: + report[emotion]["subspace_eigvals"] = { + str(l): vals for l, vals in per_l.items() + } + # Linear combinations — for each concept, how much of its direction # is explained by a ridge regression on the others. R² > 0.9 flags # concepts that are essentially linear combinations of their peers