From 6fedc9b2a8d80ab0e1bd2487a7f9ddd062aec32e Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 24 Apr 2026 11:53:42 -0400
Subject: [PATCH] amygdala: underscore-prefixed files join every concept's
 negative pool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Files in direct/ named _*.txt (e.g. _baseline.txt) are conceptless
neutral prose — they should not appear as positive training signal,
but are useful as shared negatives across every concept.

Previously _*.txt files were silently skipped. Now:
  * they're loaded like any other description file;
  * the concepts list (the positive label set) excludes them;
  * their descriptions are collected into neg_pool_extra, which is
    appended to every concept's neg_pool alongside the cross-concept
    negatives.

A concept's negative pool is thus "other concepts' descriptions +
everything from _*.txt files". The extra pool is announced at startup
so the user can see how many neutral samples are active.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
---
 training/amygdala_training/train_direct.py | 23 ++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/training/amygdala_training/train_direct.py b/training/amygdala_training/train_direct.py
index 8749e37..2ad2a30 100644
--- a/training/amygdala_training/train_direct.py
+++ b/training/amygdala_training/train_direct.py
@@ -35,12 +35,11 @@ from steering_vectors.aggregators import pca_aggregator
 
 def _load_descriptions(direct_dir: Path) -> dict[str, list[str]]:
     """Each file in direct_dir is `{concept}.txt`. Descriptions are
-    separated by blank lines within the file."""
+    separated by blank lines within the file. Files starting with `_`
+    are not concepts but are included in negative pools (e.g. _baseline.txt)."""
     out: dict[str, list[str]] = {}
     for f in sorted(direct_dir.glob("*.txt")):
-        if f.name.startswith("_"):
-            continue
-        concept = f.stem
+        concept = f.stem  # underscore-prefixed names keep their prefix
         text = f.read_text()
         descs = [d.strip() for d in text.split("\n\n") if d.strip()]
         out[concept] = descs
@@ -69,11 +68,19 @@ def main() -> None:
     target_layers = [int(x) for x in args.target_layers.split(",")]
     dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[args.dtype]
 
-    descriptions = _load_descriptions(Path(args.direct_dir))
-    concepts = sorted(descriptions.keys())
+    all_descriptions = _load_descriptions(Path(args.direct_dir))
+    # Files starting with `_` are neg-pool helpers (e.g. _baseline.txt), not concepts.
+    concepts = sorted(k for k in all_descriptions if not k.startswith("_"))
+    neg_pool_extra: list[str] = []
+    for k, ds in all_descriptions.items():
+        if k.startswith("_"):
+            neg_pool_extra.extend(ds)
+    descriptions = {k: all_descriptions[k] for k in concepts}
     print(f"Loaded {len(concepts)} concepts with direct descriptions:")
     for c in concepts:
         print(f"  {c}: {len(descriptions[c])} descriptions")
+    if neg_pool_extra:
+        print(f"Plus {len(neg_pool_extra)} neutral/baseline descriptions added to every concept's negative pool")
 
     print(f"\nLoading {args.model} ({args.dtype}) on {args.device}...")
     tokenizer = AutoTokenizer.from_pretrained(args.model)
@@ -117,6 +124,10 @@ def main() -> None:
         for other, other_descs in descriptions.items():
             if other != concept:
                 neg_pool.extend(other_descs)
+        # Underscore-prefixed files (e.g. _baseline.txt) contribute to
+        # every concept's negative pool, independent of the other-
+        # concept negatives.
+        neg_pool.extend(neg_pool_extra)
 
         rng = random.Random(hash(concept) & 0xFFFFFFFF)
         samples: list[SteeringVectorTrainingSample] = []