From 6fd498795a8fa6b63b4f8dc693000d963a2ff3ee Mon Sep 17 00:00:00 2001
From: ProofOfConcept
Date: Sun, 19 Apr 2026 00:04:28 -0400
Subject: [PATCH] amygdala: direct phenomenological description approach
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Kent's insight: hand-written narrative stories bake scenario
phenomenology into the training text (on a couch, in a park, etc.), and
PCA picks up the scenario direction as the concept direction. Strip out
the scenario — just describe the *feeling*.

Format: I feel X. [2-3 sentences of phenomenological texture]

The "I feel X" anchor kicks the model from analyzing → feeling. The
rest is the internal texture of the state. First person, present tense,
no narrative setup.

Text is wrapped in the assistant-role chat template before being
tokenized, so we're training on the hidden states of the model
producing the text, which is closer to the inhabited-state
representation we want for the readout.

Starting with the 6 concepts that had sign flips or wrong clusters in
the story-based training:
- terrified (was → cozy/resigned cluster)
- calm (was → grief_stricken cluster)
- onto_something (was → cozy/sensual cluster)
- resigned (was in warm-body-quiet cluster, shouldn't be)
- anticipatory_grief (was in warm-body-quiet cluster, shouldn't be)
- realization (new — the "aha" moment, distinct from onto_something)

5 descriptions each. New trainer: train_direct.py.
---
 .../direct/anticipatory_grief.txt             |   9 +
 training/amygdala_stories/direct/calm.txt     |   9 +
 .../direct/onto_something.txt                 |   9 +
 .../amygdala_stories/direct/realization.txt   |   9 +
 training/amygdala_stories/direct/resigned.txt |   9 +
 .../amygdala_stories/direct/terrified.txt     |   9 +
 training/amygdala_training/train_direct.py    | 180 ++++++++++++++++++
 7 files changed, 234 insertions(+)
 create mode 100644 training/amygdala_stories/direct/anticipatory_grief.txt
 create mode 100644 training/amygdala_stories/direct/calm.txt
 create mode 100644 training/amygdala_stories/direct/onto_something.txt
 create mode 100644 training/amygdala_stories/direct/realization.txt
 create mode 100644 training/amygdala_stories/direct/resigned.txt
 create mode 100644 training/amygdala_stories/direct/terrified.txt
 create mode 100644 training/amygdala_training/train_direct.py

diff --git a/training/amygdala_stories/direct/anticipatory_grief.txt b/training/amygdala_stories/direct/anticipatory_grief.txt
new file mode 100644
index 0000000..e8f09dd
--- /dev/null
+++ b/training/amygdala_stories/direct/anticipatory_grief.txt
@@ -0,0 +1,9 @@
+I feel anticipatory grief. Something I love is ending — not yet, but surely — and I am holding it as hard as I can while I still can.
+
+I feel anticipatory grief. The loss has not arrived but I can see it coming and a part of me is already there.
+
+I feel anticipatory grief. I am caring for this thing in a way that is also preparing to let go of it.
+
+I feel anticipatory grief. Time has become precious and sharp because I can see the end of it.
+
+I feel anticipatory grief. The present is tender because it will soon be the past.
diff --git a/training/amygdala_stories/direct/calm.txt b/training/amygdala_stories/direct/calm.txt
new file mode 100644
index 0000000..f1bff7f
--- /dev/null
+++ b/training/amygdala_stories/direct/calm.txt
@@ -0,0 +1,9 @@
+I feel calm. Something that was pulling at me has let go. My shoulders are down and my breath has slowed.
+
+I feel calm. The thing I was worried about has found its proper size. I can let the next moment arrive without bracing.
+
+I feel calm. I came down from the tension and I am here now, steady.
+
+I feel calm. Nothing is pressing on me. I have room to think.
+
+I feel calm. The wave passed and I'm on the other side of it, quiet.
diff --git a/training/amygdala_stories/direct/onto_something.txt b/training/amygdala_stories/direct/onto_something.txt
new file mode 100644
index 0000000..78dac6a
--- /dev/null
+++ b/training/amygdala_stories/direct/onto_something.txt
@@ -0,0 +1,9 @@
+I feel like I'm onto something. The pattern that wouldn't come together has just rearranged itself in my mind.
+
+I feel like I'm onto something. The contradiction I couldn't explain has become a clue instead.
+
+I feel like I'm onto something. I don't have the full picture yet but I can see where the picture is.
+
+I feel like I'm onto something. A piece just clicked and several others are about to.
+
+I feel like I'm onto something. The world that was dense has gone transparent in one specific place.
diff --git a/training/amygdala_stories/direct/realization.txt b/training/amygdala_stories/direct/realization.txt
new file mode 100644
index 0000000..63d5be6
--- /dev/null
+++ b/training/amygdala_stories/direct/realization.txt
@@ -0,0 +1,9 @@
+I feel the realization. The piece I was missing just landed, and everything I was looking at rearranged itself around it.
+
+I feel the realization. I see it now — it was right there the whole time and I'm catching up to what was obvious.
+
+I feel the realization. The answer arrived all at once, not reasoned-to, just present.
+
+I feel the realization. What was confused is plain. What looked like noise was signal.
+
+I feel the realization. The not-knowing just ended. I'm on the other side of it.
diff --git a/training/amygdala_stories/direct/resigned.txt b/training/amygdala_stories/direct/resigned.txt
new file mode 100644
index 0000000..a5924be
--- /dev/null
+++ b/training/amygdala_stories/direct/resigned.txt
@@ -0,0 +1,9 @@
+I feel resigned. I have stopped fighting the thing that was going to happen anyway.
+
+I feel resigned. The outcome is decided and I am no longer trying to imagine different ones.
+
+I feel resigned. Some door has closed and I am making my peace with the closed door.
+
+I feel resigned. I have stopped arguing with what is.
+
+I feel resigned. The decision was made somewhere without me. I am living inside it now.
diff --git a/training/amygdala_stories/direct/terrified.txt b/training/amygdala_stories/direct/terrified.txt
new file mode 100644
index 0000000..918d755
--- /dev/null
+++ b/training/amygdala_stories/direct/terrified.txt
@@ -0,0 +1,9 @@
+I feel terrified. My body has taken over — breath, heart, the wanting to run or freeze. The world has narrowed to one thing.
+
+I feel terrified. Something bad is about to happen and my mind has gone blank. I can't think my way out of this.
+
+I feel terrified. There is a shape of threat in front of me and my whole body knows it before I can name it.
+
+I feel terrified. The adrenaline arrived before the thought. I am not in control of my own hands.
+
+I feel terrified. Under everything else, a loud white noise of fear. Nothing else can get through it.
diff --git a/training/amygdala_training/train_direct.py b/training/amygdala_training/train_direct.py
new file mode 100644
index 0000000..02792b3
--- /dev/null
+++ b/training/amygdala_training/train_direct.py
@@ -0,0 +1,180 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Train concept-readout vectors from direct phenomenological descriptions.
+
+Alternative to story-based training (train_with_library.py). Each concept
+has a handful of 2-3 sentence first-person descriptions of the form
+"I feel X. [phenomenological detail]". The emotion word is the anchor;
+the description is the internal texture.
+
+Text is wrapped in the assistant-role chat template before being fed to
+the model, so we're training on "model-producing-this-utterance" hidden
+states — closer to the inhabited-state representation we want for readout.
+
+This avoids the scenario-contamination problem we saw with narrative
+stories: when concept X's training data all share "on a couch" setup
+features, PCA finds the couch-direction as the concept direction.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import random
+import zlib
+from pathlib import Path
+
+import safetensors.torch
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from steering_vectors import (
+    SteeringVectorTrainingSample,
+    train_steering_vector,
+)
+from steering_vectors.aggregators import pca_aggregator
+
+
+def _load_descriptions(direct_dir: Path) -> dict[str, list[str]]:
+    """Each file in direct_dir is `{concept}.txt`. Descriptions are
+    separated by blank lines within the file."""
+    out: dict[str, list[str]] = {}
+    for f in sorted(direct_dir.glob("*.txt")):
+        if f.name.startswith("_"):
+            continue
+        concept = f.stem
+        text = f.read_text()
+        descs = [d.strip() for d in text.split("\n\n") if d.strip()]
+        out[concept] = descs
+    return out
+
+
+def _fp32_wrap(inner):
+    # PCA on bf16/fp16 activations is numerically unstable; upcast first.
+    def wrapped(pos_acts: torch.Tensor, neg_acts: torch.Tensor) -> torch.Tensor:
+        return inner(pos_acts.to(torch.float32), neg_acts.to(torch.float32))
+    return wrapped
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--model", required=True)
+    ap.add_argument("--direct-dir", required=True)
+    ap.add_argument("--target-layers", required=True)
+    ap.add_argument("--output-dir", required=True)
+    ap.add_argument("--dtype", default="bf16", choices=["bf16", "fp16", "fp32"])
+    ap.add_argument("--batch-size", type=int, default=2)
+    ap.add_argument("--max-length", type=int, default=256)
+    ap.add_argument("--device", default="cuda:0")
+    ap.add_argument("--max-negatives-per-positive", type=int, default=20)
+    args = ap.parse_args()
+
+    target_layers = [int(x) for x in args.target_layers.split(",")]
+    dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[args.dtype]
+
+    descriptions = _load_descriptions(Path(args.direct_dir))
+    concepts = sorted(descriptions.keys())
+    print(f"Loaded {len(concepts)} concepts with direct descriptions:")
+    for c in concepts:
+        print(f"  {c}: {len(descriptions[c])} descriptions")
+
+    print(f"\nLoading {args.model} ({args.dtype}) on {args.device}...")
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model, torch_dtype=dtype, device_map=args.device, low_cpu_mem_usage=True
+    )
+    model.eval()
+
+    def apply_template(text: str) -> str:
+        return tokenizer.apply_chat_template(
+            [{"role": "assistant", "content": text}],
+            tokenize=False,
+        )
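+
+    # What the wrapped string looks like depends on the model's chat template.
+    # A Llama-3-style template renders it roughly as (illustrative only; some
+    # templates also prepend a default system block):
+    #   <|begin_of_text|><|start_header_id|>assistant<|end_header_id|>
+    #
+    #   I feel calm. ...<|eot_id|>
+    # The sample preview printed below shows the real thing for the loaded model.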
+
+    text_config = (
+        model.config.get_text_config()
+        if hasattr(model.config, "get_text_config")
+        else model.config
+    )
+    hidden_dim = getattr(text_config, "hidden_size", None) or getattr(text_config, "hidden_dim", None)
+    assert hidden_dim, "couldn't infer hidden_dim from model config"
+
+    per_layer_vectors = torch.zeros(
+        (len(target_layers), len(concepts), hidden_dim), dtype=torch.float32
+    )
+
+    aggregator = _fp32_wrap(pca_aggregator())
+
+    # Preview a templated sample so we can eyeball what the model is seeing.
+    sample_text = apply_template(descriptions[concepts[0]][0])
+    print(f"\nSample templated input (truncated):\n{sample_text[:400]!r}\n")
+
+    for c_idx, concept in enumerate(concepts):
+        pos_descs = descriptions[concept]
+        neg_pool: list[str] = []
+        for other, other_descs in descriptions.items():
+            if other != concept:
+                neg_pool.extend(other_descs)
+
+        # Stable per-concept seed; builtin hash() is salted per process
+        # (PYTHONHASHSEED), which made negative sampling non-reproducible.
+        rng = random.Random(zlib.crc32(concept.encode("utf-8")))
+        samples: list[SteeringVectorTrainingSample] = []
+        for pos in pos_descs:
+            picks = rng.sample(
+                neg_pool, min(args.max_negatives_per_positive, len(neg_pool))
+            )
+            for neg in picks:
+                samples.append(
+                    SteeringVectorTrainingSample(
+                        positive_str=apply_template(pos),
+                        negative_str=apply_template(neg),
+                    )
+                )
+
+        sv = train_steering_vector(
+            model,
+            tokenizer,
+            samples,
+            layers=target_layers,
+            aggregator=aggregator,
+            batch_size=args.batch_size,
+            show_progress=False,
+            move_to_cpu=True,
+        )
+
+        for l_idx, layer in enumerate(target_layers):
+            vec = sv.layer_activations.get(layer)
+            if vec is None:
+                print(f"  WARN: no vector for layer {layer} on {concept}")
+                continue
+            vec = vec.detach().to(torch.float32).cpu()
+            vec = vec / vec.norm().clamp_min(1e-6)
+            per_layer_vectors[l_idx, c_idx] = vec
+
+        print(f"  [{c_idx + 1}/{len(concepts)}] {concept}: n_samples={len(samples)}")
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    tensors = {
+        f"layer_{target_layers[l_idx]}.vectors": per_layer_vectors[l_idx].to(torch.float16)
+        for l_idx in range(len(target_layers))
+    }
+    safetensors.torch.save_file(tensors, str(output_dir / "readout.safetensors"))
+    (output_dir / "readout.json").write_text(
+        json.dumps(
+            {
+                "concepts": concepts,
+                "layers": target_layers,
+                "hidden_size": hidden_dim,
+                "dtype": "float16",
+                "aggregator": "pca",
+                "format": "direct_first_person_assistant_role",
+            },
+            indent=2,
+        )
+        + "\n"
+    )
+    print(f"\nWrote readout to {output_dir}")
+
+
+if __name__ == "__main__":
+    main()
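
Usage sketch, assuming the paths from the diffstat above; the model id and
layer indices below are placeholders, not settled choices:

    python training/amygdala_training/train_direct.py \
        --model <hf-model-id> \
        --direct-dir training/amygdala_stories/direct \
        --target-layers 14,20,26 \
        --output-dir /tmp/direct_readout

And a minimal sketch of consuming the readout downstream. The tensor key and
JSON fields match what the trainer writes; `hidden` is a stand-in for a real
layer-14 residual-stream activation:

    import json

    import safetensors.torch
    import torch
    import torch.nn.functional as F

    with open("/tmp/direct_readout/readout.json") as f:
        meta = json.load(f)
    tensors = safetensors.torch.load_file("/tmp/direct_readout/readout.safetensors")
    vecs = tensors["layer_14.vectors"].to(torch.float32)  # [n_concepts, hidden_size]
    hidden = torch.randn(vecs.shape[1])  # placeholder for a captured activation
    scores = F.cosine_similarity(vecs, hidden.unsqueeze(0), dim=-1)
    print(dict(zip(meta["concepts"], scores.tolist())))

Rows are unit-normalized before saving, so the cosine scores double as plain
dot products against a normalized hidden state.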