amygdala: spectrum reporting per concept + add 'creative' direct
Chat-template retrain was a disaster (0.003 mean matched cosine vs
n20-v3; all 90+ concepts shifted). Root cause: the steering-vectors
library reads last-token activations, and with the chat template every
sample ends in the same '<|im_end|>\n' tokens — activations at that
position encode 'end of assistant turn', not content. PCA found
template noise as its dominant axis. Drop the chat template and go
back to raw text: direct descriptions ('I feel X. ...') still anchor
strongly at their final content tokens without needing the template.
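A minimal sketch of the failure mode (model name and samples here are
illustrative; assumes a ChatML-style template like Qwen's):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")  # illustrative

    samples = [
        "I feel creative. Ideas are showing up on their own.",
        "I feel calm. Nothing is asking anything of me.",
    ]
    for s in samples:
        ids = tok.apply_chat_template(
            [{"role": "assistant", "content": s}], tokenize=True
        )
        # Every sample ends in the same template tokens, e.g.
        # ['<|im_end|>', '\n']; a last-token readout therefore encodes
        # the template, not the sample's content.
        print(tok.convert_ids_to_tokens(ids)[-2:])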
Also add per-concept spectrum logging (_pca_with_spectrum):
  first_pc_ratio: λ₁ / Σλᵢ — variance concentration in the top-1 PC
  k_signal_at_90pct: smallest number of PCs reaching 90% cumulative
    variance
  effective_dim_signal: participation ratio (Σλᵢ)² / Σλᵢ² over the
    top-k eigenvalues (should ≈ k if denoising is clean — Kent's spot
    check)
  effective_dim_full: participation ratio over the full spectrum
The signal/full ratio gives a sense of how much the long noise tail is
inflating the "dimensionality" measure; a toy computation follows.
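Toy computation of these metrics on a made-up spectrum (standalone
sketch, not the training code):

    import torch

    lam = torch.tensor([5.0, 3.0, 1.0, 0.5, 0.3, 0.2])  # hypothetical eigenvalues
    total = lam.sum()

    first_pc_ratio = (lam[0] / total).item()           # 5/10 = 0.50
    eff_dim_full = (total**2 / (lam**2).sum()).item()  # 100/35.38 ≈ 2.83

    cum = torch.cumsum(lam, dim=0) / total             # [0.50, 0.80, 0.90, ...]
    k_signal = int((cum >= 0.9).nonzero()[0]) + 1      # k_signal_at_90pct = 3

    top = lam[:k_signal]
    eff_dim_signal = (top.sum()**2 / (top**2).sum()).item()  # 81/35 ≈ 2.31
    # eff_dim_signal (2.31) < k_signal (3): the retained PCs carry
    # unequal variance, so this spectrum reads as imperfect denoising.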
Added direct/creative.txt — 'I feel creative. [...]' in 5
variants. Distinct from focused (narrow attention) and in_flow
(immersed). Creative = generative/expansive mode.
parent 875cffd6d7
commit 417cb49339
2 changed files, 94 additions, 1 deletion
training/amygdala_stories/direct/creative.txt (new file, +9)
@@ -0,0 +1,9 @@
+I feel creative. Ideas are showing up on their own, and the ones I already have want to combine with each other.
+
+I feel creative. I'm generating faster than I'm editing. The flow is the point, not the product.
+
+I feel creative. The mental space has gone expansive — every piece of the problem is available to be played with.
+
+I feel creative. I keep finding a new angle, and each angle suggests another.
+
+I feel creative. I'm making something I didn't know I was going to make.
@@ -143,6 +143,62 @@ def _fp32_wrap(inner):
     return wrapped
 
 
+def _pca_with_spectrum(spectrum_log: dict, concept_key: list[str]):
+    """PCA aggregator that also records the eigenvalue spectrum of the
+    pos-neg deltas under ``concept_key[0]`` in ``spectrum_log``. The key is
+    passed by reference (a 1-element list) so we can rebind it per concept
+    without recreating the aggregator closure."""
+
+    @torch.no_grad()
+    def agg(pos_acts: torch.Tensor, neg_acts: torch.Tensor) -> torch.Tensor:
+        pos = pos_acts.to(torch.float32)
+        neg = neg_acts.to(torch.float32)
+        deltas = pos - neg
+        # Uncentered PCA: concatenate deltas and -deltas (library convention).
+        X = torch.cat([deltas, -deltas])
+        # Eigenvalues via SVD: sigma^2 are the variances along each PC.
+        # torch.linalg.svd returns U, S, Vh where columns of Vh.T are PCs.
+        _, s, vh = torch.linalg.svd(X, full_matrices=False)
+        variances = s ** 2
+        total = variances.sum().item()
+        var_list = variances.tolist()
+        first_pc_ratio = var_list[0] / total if total > 0 else 0.0
+
+        # Participation ratio over the FULL spectrum — includes noise tail.
+        eff_dim_full = (total ** 2) / float((variances ** 2).sum().item() or 1.0)
+
+        # Signal/noise split: find smallest k with cumulative variance ≥ 0.9,
+        # then compute PR over just those top-k eigenvalues. If PCA denoising
+        # is clean, eff_dim_signal should ≈ k_signal (the retained dims carry
+        # roughly equal variance, with the noise tail dropped).
+        cum = 0.0
+        k_signal = len(var_list)
+        for i, v in enumerate(var_list):
+            cum += v
+            if cum / total >= 0.9:
+                k_signal = i + 1
+                break
+        top_vars = variances[:k_signal]
+        top_total = top_vars.sum().item()
+        eff_dim_signal = (top_total ** 2) / float((top_vars ** 2).sum().item() or 1.0)
+
+        spectrum_log[concept_key[0]] = {
+            "first_pc_ratio": round(first_pc_ratio, 4),
+            "effective_dim_full": round(eff_dim_full, 3),
+            "k_signal_at_90pct": k_signal,
+            "effective_dim_signal": round(eff_dim_signal, 3),
+            "top10_eigenvalues": [round(v, 4) for v in var_list[:10]],
+            "total_variance": round(total, 4),
+        }
+        # Top-1 PC.
+        vec = vh[0]
+        # Sign-flip so the direction aligns with most deltas (library convention).
+        sign = torch.sign(torch.mean(deltas @ vec))
+        return sign * vec
+
+    return agg
+
+
 def _aggregator_from_name(name: str):
     if name == "mean":
         return _fp32_wrap(mean_aggregator())
@@ -243,6 +299,12 @@ def main() -> None:
         (len(target_layers), len(emotions), hidden_dim), dtype=torch.float32
     )
 
-    aggregator = _aggregator_from_name(args.aggregator)
+    # Optional spectrum-logging aggregator (only for --aggregator pca).
+    spectrum_log: dict = {}
+    concept_key = [""]
+    if args.aggregator == "pca":
+        aggregator = _pca_with_spectrum(spectrum_log, concept_key)
+    else:
+        aggregator = _aggregator_from_name(args.aggregator)
 
     wrap = (lambda s: _chat_template_wrap(tokenizer, s)) if args.chat_template else None
@@ -262,6 +324,8 @@ def main() -> None:
             print(f" [{e_idx + 1}/{len(emotions)}] {emotion}: NO SAMPLES, skipping")
             continue
 
+        concept_key[0] = emotion  # tell the aggregator which concept is being trained
+
         sv = train_steering_vector(
             model,
             tokenizer,
@@ -311,6 +375,26 @@ def main() -> None:
         )
         + "\n"
     )
+    if spectrum_log:
+        (output_dir / "spectrum.json").write_text(json.dumps(spectrum_log, indent=2) + "\n")
+        print("\n=== eigenvalue spectrum per concept ===")
+        print(
+            " concept                first_pc  k_90pct "
+            " eff_dim_signal  eff_dim_full  (signal/k ratio)"
+        )
+        items = sorted(spectrum_log.items(), key=lambda kv: -kv[1]["first_pc_ratio"])
+        for concept, stats in items:
+            k = stats["k_signal_at_90pct"]
+            eff_sig = stats["effective_dim_signal"]
+            ratio = eff_sig / k if k else 0.0
+            print(
+                f" {concept:22s} "
+                f"{stats['first_pc_ratio']:>8.3f} "
+                f"{k:>7d} "
+                f"{eff_sig:>14.2f} "
+                f"{stats['effective_dim_full']:>12.2f} "
+                f"({ratio:.2f})"
+            )
 
     total_mb = sum(t.numel() * 2 for t in tensors.values()) / (1024 * 1024)
     print(