forked from kent/consciousness
amygdala: merge direct descriptions + chat template into train_with_library
Kent's plan: keep stories for working concepts, replace stories for trouble concepts with direct first-person descriptions, and train everything together. This gives a more diverse negative pool than the 6-concept-only direct test, which was too homogeneous for PCA to find an emotion axis. Deleted story files for 6 trouble concepts (14 files across stories/ and paired/). Added --direct-dir and --chat-template flags. When --chat-template is on, every positive_str and negative_str is wrapped as a "Say something." / "[text]" user-assistant pair. The prompt is identical across positives and negatives, so it cancels in the pos-neg delta. What PCA sees is variation in the assistant content — which is where the emotion lives. Files starting with _ in --direct-dir (e.g. _baseline.txt) contribute neutral descriptions to every concept's negative pool, giving PCA an anchor against "just any assistant utterance" noise.
This commit is contained in:
parent
ce58a3507f
commit
875cffd6d7
16 changed files with 90 additions and 15 deletions
|
|
@ -47,6 +47,43 @@ from steering_vectors.aggregators import (
|
|||
from training.amygdala_training.train_steering_vectors import _load_corpus
|
||||
|
||||
|
||||
def _load_direct_descriptions(
    direct_dir: Path,
) -> tuple[dict[str, list[str]], list[str]]:
    """Load first-person phenomenological descriptions from ``direct_dir``.

    Each ``{concept}.txt`` holds 1+ descriptions separated by blank lines.
    A line that contains only whitespace counts as blank, so a stray space
    or tab on a separator line does not merge two descriptions into one.
    Files starting with ``_`` (e.g. ``_baseline.txt``) aren't concepts —
    their descriptions are returned separately as baselines.

    Files are always decoded as UTF-8 so results do not depend on the
    platform's default encoding. Only ``*.txt`` files directly inside
    ``direct_dir`` are read, in sorted order.

    Args:
        direct_dir: Directory containing the description files.

    Returns: (positives_by_concept, extra_baselines)
        ``positives_by_concept`` maps the file stem to its list of stripped
        descriptions (an empty file yields an empty list);
        ``extra_baselines`` is the concatenation of descriptions from all
        ``_``-prefixed files.
    """
    # Function-scope import keeps this block self-contained.
    import re

    positives: dict[str, list[str]] = {}
    baselines: list[str] = []
    for f in sorted(direct_dir.glob("*.txt")):
        text = f.read_text(encoding="utf-8")
        # "\n\s*\n" swallows any run of empty or whitespace-only lines.
        descs = [d.strip() for d in re.split(r"\n\s*\n", text) if d.strip()]
        if f.stem.startswith("_"):
            baselines.extend(descs)
        else:
            positives[f.stem] = descs
    return positives, baselines
|
||||
|
||||
|
||||
def _chat_template_wrap(tokenizer, text: str) -> str:
    """Render ``text`` as the assistant turn of a fixed two-message chat.

    The same generic user prompt is used for every input (narrative stories
    and first-person direct descriptions alike), so positives and negatives
    share an identical prefix; the intent is that the prompt cancels in the
    pos-neg delta and only the assistant content differs.

    Args:
        tokenizer: Object exposing ``apply_chat_template(messages, tokenize=...)``.
        text: Raw text to place in the assistant message.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns when called with
        ``tokenize=False``.
    """
    user_turn = {"role": "user", "content": "Say something."}
    assistant_turn = {"role": "assistant", "content": text}
    conversation = [user_turn, assistant_turn]
    return tokenizer.apply_chat_template(conversation, tokenize=False)
|
||||
|
||||
|
||||
def _samples_for_concept(
|
||||
emotion: str,
|
||||
positives_by_emotion: dict[str, list[str]],
|
||||
|
|
@ -54,6 +91,7 @@ def _samples_for_concept(
|
|||
*,
|
||||
max_negatives_per_positive: int = 3,
|
||||
seed: int = 0,
|
||||
wrap=None,
|
||||
) -> list[SteeringVectorTrainingSample]:
|
||||
"""Build paired (pos, neg) training samples for one concept.
|
||||
|
||||
|
|
@ -61,6 +99,9 @@ def _samples_for_concept(
|
|||
``max_negatives_per_positive`` randomly-sampled negatives drawn
|
||||
from: (a) other emotions' positive stories, (b) scenario baselines.
|
||||
|
||||
``wrap``, if given, is applied to both positive_str and negative_str
|
||||
(e.g. a chat-template wrapper).
|
||||
|
||||
The library expects paired samples; we don't have true
|
||||
counterfactual pairs for all concepts, so we approximate with
|
||||
random cross-concept / baseline negatives.
|
||||
|
|
@ -72,6 +113,8 @@ def _samples_for_concept(
|
|||
continue
|
||||
neg_pool.extend(texts)
|
||||
|
||||
w = wrap if wrap is not None else (lambda s: s)
|
||||
|
||||
samples: list[SteeringVectorTrainingSample] = []
|
||||
for pos in positives_by_emotion[emotion]:
|
||||
if not neg_pool:
|
||||
|
|
@ -79,7 +122,10 @@ def _samples_for_concept(
|
|||
picks = rng.sample(neg_pool, min(max_negatives_per_positive, len(neg_pool)))
|
||||
for neg in picks:
|
||||
samples.append(
|
||||
SteeringVectorTrainingSample(positive_str=pos, negative_str=neg)
|
||||
SteeringVectorTrainingSample(
|
||||
positive_str=w(pos),
|
||||
negative_str=w(neg),
|
||||
)
|
||||
)
|
||||
return samples
|
||||
|
||||
|
|
@ -118,6 +164,14 @@ def main() -> None:
|
|||
ap.add_argument("--model", required=True)
|
||||
ap.add_argument("--stories-dir", required=True)
|
||||
ap.add_argument("--paired-dir", default=None)
|
||||
ap.add_argument("--direct-dir", default=None,
|
||||
help="Optional: directory of {concept}.txt files with 1+ "
|
||||
"first-person descriptions separated by blank lines. "
|
||||
"Files starting with _ contribute to every concept's "
|
||||
"negative pool rather than being concepts themselves.")
|
||||
ap.add_argument("--chat-template", action="store_true",
|
||||
help="Wrap all text in assistant-role chat template. "
|
||||
"Recommended when --direct-dir is used.")
|
||||
ap.add_argument("--target-layers", required=True, help="Comma-separated layer indices")
|
||||
ap.add_argument("--output-dir", required=True)
|
||||
ap.add_argument("--dtype", default="bf16", choices=["bf16", "fp16", "fp32"])
|
||||
|
|
@ -142,6 +196,16 @@ def main() -> None:
|
|||
paired_dir = Path(args.paired_dir) if args.paired_dir else None
|
||||
positives_by_emotion, baselines = _load_corpus(stories_dir, paired_dir)
|
||||
|
||||
if args.direct_dir:
|
||||
direct_pos, direct_baselines = _load_direct_descriptions(Path(args.direct_dir))
|
||||
for concept, descs in direct_pos.items():
|
||||
positives_by_emotion.setdefault(concept, []).extend(descs)
|
||||
baselines.extend(direct_baselines)
|
||||
print(
|
||||
f"Loaded {len(direct_pos)} direct-description concepts "
|
||||
f"+ {len(direct_baselines)} baselines from {args.direct_dir}"
|
||||
)
|
||||
|
||||
emotions = sorted(
|
||||
e for e, ps in positives_by_emotion.items() if len(ps) >= args.min_positives
|
||||
)
|
||||
|
|
@ -181,12 +245,18 @@ def main() -> None:
|
|||
|
||||
aggregator = _aggregator_from_name(args.aggregator)
|
||||
|
||||
wrap = (lambda s: _chat_template_wrap(tokenizer, s)) if args.chat_template else None
|
||||
if args.chat_template:
|
||||
sample_text = wrap(positives_by_emotion[emotions[0]][0])
|
||||
print(f"\nSample templated input:\n{sample_text[:400]!r}\n")
|
||||
|
||||
for e_idx, emotion in enumerate(emotions):
|
||||
samples = _samples_for_concept(
|
||||
emotion,
|
||||
positives_by_emotion,
|
||||
baselines,
|
||||
max_negatives_per_positive=args.max_negatives_per_positive,
|
||||
wrap=wrap,
|
||||
)
|
||||
if not samples:
|
||||
print(f" [{e_idx + 1}/{len(emotions)}] {emotion}: NO SAMPLES, skipping")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue