From 974c6c7fd2a5100cb96dc9dddc3fa672be5d6ec2 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 18 Apr 2026 21:33:48 -0400
Subject: [PATCH] amygdala: report eigenvalue spectrum for subspace method
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When --method subspace is selected, record the top-20 eigenvalues of
(M_pos - M_base) per concept per layer. They are added to quality.json
under the 'subspace_eigvals' key.

This tells us whether the concept lives in a single dominant direction
(λ_0 >> λ_1, so the top eigenvector is enough) or is spread across several
shared directions (λ_0 ≈ λ_1, so keeping only the top-1 loses signal).

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
---
 .../train_steering_vectors.py                 | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/training/amygdala_training/train_steering_vectors.py b/training/amygdala_training/train_steering_vectors.py
index ba8fa5d..b44df2c 100644
--- a/training/amygdala_training/train_steering_vectors.py
+++ b/training/amygdala_training/train_steering_vectors.py
@@ -1001,6 +1001,9 @@ def main() -> None:
     # and use sum-of-projection-operators per concept. --------------------
     pos_subspaces: list[dict[int, torch.Tensor]] | None = None
     base_subspaces: list[dict[int, torch.Tensor]] | None = None
+    # Per (concept, layer): up to 20 largest eigenvalues of (M_pos - M_base),
+    # in descending order. Populated only when --method subspace.
+    subspace_eigvals: dict[str, dict[int, list[float]]] = {}
     if args.method == "subspace":
         print("\nCollecting per-story subspaces (SVD, top-k right singular "
               f"vectors, k={args.subspace_k})...")
@@ -1035,10 +1038,14 @@ def main() -> None:
                 pos_V = [pos_subspaces[j][target_l] for j in pos_rows]
                 base_V = [pos_subspaces[j][target_l] for j in neg_rows]
                 base_V += [bs[target_l] for bs in (base_subspaces or [])]
-                top_vec, _eigvals = _subspace_concept_direction(
+                top_vec, eigvals = _subspace_concept_direction(
                     pos_V, base_V, hidden=hidden_dim,
                 )
                 per_layer_vectors[l_idx, e_idx] = top_vec
+                # Keep the top-20 eigenvalues for quality-report diagnostics.
+                subspace_eigvals.setdefault(emotion, {})[target_l] = (
+                    eigvals[-20:].flip(0).tolist()
+                )
         else:
             pos = positive_acts[pos_rows]         # [n_pos, n_layers, hidden]
             neg = positive_acts[neg_rows]         # [n_neg, n_layers, hidden]
@@ -1149,6 +1156,17 @@ def main() -> None:
                 "per-head analysis skipped."
             )
 
+        # Eigenvalue spectrum from the subspace method — if populated, report
+        # the top-20 eigenvalues per concept per layer. Tells us whether the
+        # concept direction lives in a single dominant dimension (λ_0 >> λ_1)
+        # or a spread of common directions (λ_0 ≈ λ_1 ≈ ...).
+        if subspace_eigvals:
+            for emotion, per_l in subspace_eigvals.items():
+                if emotion in report:
+                    report[emotion]["subspace_eigvals"] = {
+                        str(l): vals for l, vals in per_l.items()
+                    }
+
         # Linear combinations — for each concept, how much of its direction
         # is explained by a ridge regression on the others. R² > 0.9 flags
         # concepts that are essentially linear combinations of their peers