forked from kent/consciousness
amygdala: quality-report + cognitive-state training scenarios
Training pipeline additions:
- `--quality-report` flag: after producing per-concept vectors, compute
per-concept diagnostics and write quality.json. Metrics per concept:
* SVD of centered positives -> first_pc_variance_ratio (rank
analysis; >0.7 clean, <0.4 fragmented)
* Per-story alignment cosines (stories agree or disagree)
* Single-neuron alignment: best cosine(direction, W_down column)
at each target layer (>0.6 = essentially one MLP neuron)
* Top-2 outlier stories by alignment (candidates for
mislabeling or off-topic)
* Top-5 nearest concepts by cosine (cross-concept contamination)
Triage summary printed at end.
New paired scenarios for cognitive-process states (for alpha-beta
pruning): tracing_a_bug, reading_unfamiliar_code, finding_the_abstraction.
Each has baseline + onto_something / stuck / in_flow / determined
variants.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
5f06577ead
commit
ce24d9ce6b
14 changed files with 249 additions and 0 deletions
|
|
@ -216,6 +216,182 @@ def _load_corpus(stories_dir: Path, paired_dir: Path | None) -> tuple[
|
|||
return positives, baselines
|
||||
|
||||
|
||||
def _find_mlp_down_proj(model, layer_idx: int) -> torch.Tensor | None:
|
||||
"""Return the W_down weight for the MLP at the given transformer layer.
|
||||
|
||||
Looks for the common paths (mlp.down_proj, mlp.c_proj, feed_forward.down_proj).
|
||||
Returns None if nothing matches — downstream code skips the single-neuron
|
||||
alignment check in that case rather than failing.
|
||||
"""
|
||||
layers = _find_layers_module(model)
|
||||
layer = layers[layer_idx]
|
||||
for path in ("mlp.down_proj", "mlp.c_proj", "feed_forward.down_proj"):
|
||||
obj = layer
|
||||
ok = True
|
||||
for part in path.split("."):
|
||||
if not hasattr(obj, part):
|
||||
ok = False
|
||||
break
|
||||
obj = getattr(obj, part)
|
||||
if ok and hasattr(obj, "weight"):
|
||||
# Shape convention: [hidden, mlp_inner] — each column is one
|
||||
# MLP neuron's contribution direction into the residual stream.
|
||||
return obj.weight.detach()
|
||||
return None
|
||||
|
||||
|
||||
def _compute_quality_report(
|
||||
emotions: list[str],
|
||||
positive_acts: torch.Tensor, # [n_positive_stories, n_layers, hidden]
|
||||
baseline_acts: torch.Tensor, # [n_baseline_stories, n_layers, hidden]
|
||||
positives_by_emotion: dict[str, list[str]],
|
||||
text_to_row: dict[str, int],
|
||||
per_layer_vectors: torch.Tensor, # [n_layers, n_concepts, hidden], unit-normed
|
||||
target_layers: list[int],
|
||||
model,
|
||||
positive_texts: list[str],
|
||||
text_to_emotion: dict[str, str],
|
||||
) -> dict:
|
||||
"""Per-concept quality metrics:
|
||||
|
||||
- first_pc_variance_ratio: SVD on centered positive activations.
|
||||
>0.7 = rank-1 (clean). <0.4 = fragmented (stories disagree).
|
||||
- story_projection_*: how each positive story projects onto the
|
||||
concept direction. Low std = tight agreement.
|
||||
- best_neuron_cosine: alignment of the residual-space direction with
|
||||
the nearest W_down column (= single MLP neuron). >0.6 = essentially
|
||||
single-neuron.
|
||||
- nearest_concepts: top-5 concept directions most parallel to this
|
||||
one. Cosine >0.8 means the vector is confused with a neighbor.
|
||||
"""
|
||||
report: dict = {}
|
||||
n_layers = per_layer_vectors.shape[0]
|
||||
|
||||
# Pre-compute per-layer W_down for single-neuron alignment.
|
||||
w_down: dict[int, torch.Tensor] = {}
|
||||
for target_l in target_layers:
|
||||
w = _find_mlp_down_proj(model, target_l)
|
||||
if w is not None:
|
||||
# Unit-normalize each column (one per MLP neuron).
|
||||
w = w.to(torch.float32)
|
||||
norms = w.norm(dim=0, keepdim=True).clamp_min(1e-6)
|
||||
w_down[target_l] = w / norms # [hidden, mlp_inner]
|
||||
|
||||
# Pre-compute unit-normed concept vectors (for cross-concept cosines).
|
||||
vec_norm = per_layer_vectors / per_layer_vectors.norm(
|
||||
dim=-1, keepdim=True
|
||||
).clamp_min(1e-6)
|
||||
|
||||
for e_idx, emotion in enumerate(emotions):
|
||||
pos_rows = [text_to_row[t] for t in positives_by_emotion[emotion]]
|
||||
pos = positive_acts[pos_rows].to(torch.float32) # [n_pos, n_layers, hidden]
|
||||
|
||||
per_layer: dict = {}
|
||||
for l_idx, target_l in enumerate(target_layers):
|
||||
pos_l = pos[:, l_idx, :] # [n_pos, hidden]
|
||||
diff_l = per_layer_vectors[l_idx, e_idx] # [hidden], unit-normed
|
||||
pos_mean_l = pos_l.mean(dim=0)
|
||||
|
||||
# SVD for rank analysis — if first PC dominates, stories agree.
|
||||
centered = pos_l - pos_mean_l
|
||||
# svdvals errors on 1-row; handle that.
|
||||
if centered.shape[0] >= 2:
|
||||
S = torch.linalg.svdvals(centered)
|
||||
var = S ** 2
|
||||
var_total = var.sum().clamp_min(1e-12)
|
||||
var_ratios = (var / var_total).tolist()
|
||||
else:
|
||||
var_ratios = [1.0]
|
||||
|
||||
# Per-story projection onto the concept direction.
|
||||
projections = pos_l @ diff_l # [n_pos]
|
||||
|
||||
# Per-story alignment: cosine(story_dir, concept_dir) where
|
||||
# story_dir = pos_i - pos_mean (centered, pointing away from center).
|
||||
if centered.shape[0] >= 2:
|
||||
centered_norm = centered / centered.norm(
|
||||
dim=-1, keepdim=True
|
||||
).clamp_min(1e-6)
|
||||
alignments = centered_norm @ diff_l
|
||||
else:
|
||||
alignments = torch.zeros(1)
|
||||
|
||||
# Single-neuron alignment: is the direction close to any
|
||||
# W_down column?
|
||||
nb_best_idx = None
|
||||
nb_best_cos = None
|
||||
nb_top5 = None
|
||||
if target_l in w_down:
|
||||
W = w_down[target_l]
|
||||
cos = W.t() @ diff_l # [mlp_inner]
|
||||
abs_cos = cos.abs()
|
||||
k = min(5, abs_cos.shape[0])
|
||||
top_vals, top_idxs = abs_cos.topk(k)
|
||||
nb_best_idx = int(top_idxs[0])
|
||||
nb_best_cos = float(cos[top_idxs[0]])
|
||||
nb_top5 = [[int(i), float(cos[i])] for i in top_idxs]
|
||||
|
||||
per_layer[str(target_l)] = {
|
||||
"top3_variance_ratios": [
|
||||
float(v) for v in var_ratios[:3]
|
||||
],
|
||||
"first_pc_variance_ratio": float(var_ratios[0]),
|
||||
"story_projection_mean": float(projections.mean()),
|
||||
"story_projection_std": float(projections.std()),
|
||||
"story_projection_min": float(projections.min()),
|
||||
"story_projection_max": float(projections.max()),
|
||||
"story_alignment_mean": float(alignments.mean()),
|
||||
"story_alignment_std": float(alignments.std()),
|
||||
"best_neuron_idx": nb_best_idx,
|
||||
"best_neuron_cosine": nb_best_cos,
|
||||
"top5_neurons": nb_top5,
|
||||
}
|
||||
|
||||
# Outlier stories: lowest-aligned on the middle target layer.
|
||||
mid = n_layers // 2
|
||||
pos_l_mid = pos[:, mid, :]
|
||||
mid_mean = pos_l_mid.mean(dim=0)
|
||||
mid_diff = per_layer_vectors[mid, e_idx]
|
||||
centered_mid = pos_l_mid - mid_mean
|
||||
if centered_mid.shape[0] >= 2:
|
||||
centered_mid_norm = centered_mid / centered_mid.norm(
|
||||
dim=-1, keepdim=True
|
||||
).clamp_min(1e-6)
|
||||
mid_aligns = centered_mid_norm @ mid_diff # [n_pos]
|
||||
# Lowest two alignments = candidate outliers.
|
||||
k = min(2, mid_aligns.shape[0])
|
||||
low_vals, low_idxs = mid_aligns.topk(k, largest=False)
|
||||
outliers = [
|
||||
[
|
||||
positives_by_emotion[emotion][int(i)],
|
||||
float(mid_aligns[i]),
|
||||
]
|
||||
for i in low_idxs
|
||||
]
|
||||
else:
|
||||
outliers = []
|
||||
|
||||
# Nearest other concepts at the middle target layer.
|
||||
this_norm = vec_norm[mid, e_idx]
|
||||
all_cos = vec_norm[mid] @ this_norm # [n_concepts]
|
||||
all_cos[e_idx] = -2.0 # mask self
|
||||
k = min(5, all_cos.shape[0] - 1)
|
||||
top_vals, top_idxs = all_cos.topk(k)
|
||||
nearest = [
|
||||
[emotions[int(i)], float(v)]
|
||||
for i, v in zip(top_idxs, top_vals)
|
||||
]
|
||||
|
||||
report[emotion] = {
|
||||
"n_positive_stories": len(pos_rows),
|
||||
"per_layer": per_layer,
|
||||
"outlier_stories": outliers,
|
||||
"nearest_concepts": nearest,
|
||||
}
|
||||
|
||||
return report
|
||||
|
||||
|
||||
def main() -> None:
|
||||
ap = argparse.ArgumentParser(description=__doc__)
|
||||
ap.add_argument("--model", required=True, help="HF model id or path")
|
||||
|
|
@ -249,6 +425,13 @@ def main() -> None:
|
|||
default=1,
|
||||
help="Skip emotions with fewer positive examples than this",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--quality-report",
|
||||
action="store_true",
|
||||
help="After training, compute a per-concept quality report "
|
||||
"(SVD rank, per-story alignment, single-neuron alignment, "
|
||||
"nearest-concept contamination) and write quality.json",
|
||||
)
|
||||
args = ap.parse_args()
|
||||
|
||||
target_layers = [int(x) for x in args.target_layers.split(",")]
|
||||
|
|
@ -445,6 +628,59 @@ def main() -> None:
|
|||
f" {n_concepts} concepts x {n_layers} layers x "
|
||||
f"{hidden_dim} dim (fp16), total {total_mb:.1f} MiB"
|
||||
)
|
||||
|
||||
if args.quality_report:
|
||||
print("\nComputing quality report...")
|
||||
report = _compute_quality_report(
|
||||
emotions=emotions,
|
||||
positive_acts=positive_acts,
|
||||
baseline_acts=baseline_acts,
|
||||
positives_by_emotion=positives_by_emotion,
|
||||
text_to_row=text_to_row,
|
||||
per_layer_vectors=per_layer_vectors,
|
||||
target_layers=target_layers,
|
||||
model=model,
|
||||
positive_texts=unique_positive_texts,
|
||||
text_to_emotion=text_to_emotion,
|
||||
)
|
||||
(output_dir / "quality.json").write_text(
|
||||
json.dumps(report, indent=2) + "\n"
|
||||
)
|
||||
|
||||
# Short summary: concepts in each triage bucket.
|
||||
clean_single_neuron = []
|
||||
clean_circuit = []
|
||||
fragmented = []
|
||||
contaminated = []
|
||||
mid = n_layers // 2
|
||||
mid_layer = target_layers[mid]
|
||||
for emotion in emotions:
|
||||
per_l = report[emotion]["per_layer"][str(mid_layer)]
|
||||
v = per_l["first_pc_variance_ratio"]
|
||||
nb = per_l.get("best_neuron_cosine") or 0.0
|
||||
top_near = report[emotion]["nearest_concepts"]
|
||||
nearest_cos = top_near[0][1] if top_near else 0.0
|
||||
if nearest_cos > 0.8:
|
||||
contaminated.append(emotion)
|
||||
elif v > 0.7 and abs(nb) > 0.6:
|
||||
clean_single_neuron.append(emotion)
|
||||
elif v > 0.7:
|
||||
clean_circuit.append(emotion)
|
||||
elif v < 0.4:
|
||||
fragmented.append(emotion)
|
||||
print(
|
||||
f"\nQuality summary @ layer {mid_layer}:\n"
|
||||
f" clean (single-neuron): {len(clean_single_neuron)}\n"
|
||||
f" clean (low-dim circuit): {len(clean_circuit)}\n"
|
||||
f" fragmented (first-PC < 0.4): {len(fragmented)}\n"
|
||||
f" contaminated (nearest > 0.8): {len(contaminated)}"
|
||||
)
|
||||
if fragmented:
|
||||
print(f" fragmented sample: {fragmented[:5]}")
|
||||
if contaminated:
|
||||
print(f" contaminated sample: {contaminated[:5]}")
|
||||
print(f"\nWrote quality.json to {output_dir}")
|
||||
|
||||
del model
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue