From 389f1bbe03eaa75f9e793afb35a400712f1341c3 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Sat, 18 Apr 2026 21:41:00 -0400
Subject: [PATCH] amygdala: bump subspace-k default to 512
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

k=20 was far too aggressive a truncation — it discards
per-attention-head discriminability entirely. At hidden_dim=5120,
40 heads × head_dim=128 each contribute their own 128-dim block to
the residual stream via W_o columns. To resolve 'this concept lives
in head H', per-story SVD needs enough rank to separate head
contributions, which means k on the order of hundreds.

512 is a reasonable default: clamped to n_tokens per story so short
stories use their full natural rank. The eigenvalue spectrum of
M_pos - M_base should become sharper (larger λ_0/λ_1 gap) as we stop
averaging across nuisance-shared directions.

Co-Authored-By: Proof of Concept
---
 training/amygdala_training/train_steering_vectors.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/training/amygdala_training/train_steering_vectors.py b/training/amygdala_training/train_steering_vectors.py
index b44df2c..54603ab 100644
--- a/training/amygdala_training/train_steering_vectors.py
+++ b/training/amygdala_training/train_steering_vectors.py
@@ -850,8 +850,12 @@ def main() -> None:
     ap.add_argument(
         "--subspace-k",
         type=int,
-        default=20,
-        help="Top-k right singular vectors per story for subspace method",
+        default=512,
+        help="Max top-k right singular vectors per story for subspace method "
+        "(clamped to n_tokens per story). Default 512 is enough to span "
+        "each story's full natural subspace including per-attention-head "
+        "contributions on a hidden_dim=5120 residual stream. Smaller "
+        "values (e.g. 20) discard per-head discriminability.",
     )
     ap.add_argument(
         "--quality-report",