amygdala: per-head attention decomposition diagnostic

As part of --quality-report, run a second forward pass capturing the
input to each target layer's o_proj (= the concatenation of per-head
attention outputs just before the output projection). For each concept,
reshape that input to [n_heads, head_dim] and rank heads by per-head
selectivity: the L2 norm of the positives-vs-negatives diff-of-means,
normalised by the negatives' per-head std. Both the raw norm and the
selectivity are reported.
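
In sketch form, per (concept, layer), with pos_l / neg_l the pooled
o_proj inputs for that layer (names illustrative; this mirrors
_compute_per_head_ranking below):

    pos_h = pos_l.view(-1, n_heads, head_dim)
    neg_h = neg_l.view(-1, n_heads, head_dim)
    diff = pos_h.mean(0) - neg_h.mean(0)      # [n_heads, head_dim]
    raw_norm = diff.norm(dim=-1)              # magnitude per head
    neg_std = neg_h.std(0).norm(dim=-1).clamp_min(1e-6)
    selectivity = raw_norm / neg_std
    top_vals, top_idx = selectivity.topk(min(10, n_heads))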

Motivation: the Wang et al. paper (2510.11328) — whose paired-scenario
methodology we already lifted — further decomposes concept circuits at
the attention-head level. Meta-relational concepts (recognition, trust,
vulnerability) plausibly live in a sparse attention-head circuit rather
than in the residual-stream sum, which would explain why diff-of-means
on the residual blurs them. This diagnostic surfaces that.

Output is folded into quality.json under each concept as "per_head":
for each analysed layer, the head count and head_dim, a list of the
top-10 heads as [head_idx, raw_norm, selectivity] triples, and
head_concentration (the fraction of total head-norm captured by those
top heads).
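
Illustrative shape of one concept's entry (layer key, head indices and
values made up; the real output lists up to 10 heads per layer):

    "per_head": {
      "24": {
        "n_heads": 32,
        "head_dim": 128,
        "top_heads": [[13, 4.21, 1.87], [7, 3.90, 1.62], [29, 2.05, 1.11]],
        "head_concentration": 0.61
      }
    }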

Interpretation:
- head_concentration > 0.5: a sparse head circuit; a handful of heads
  route the concept. Worth building a head-level readout for (a quick
  way to scan for such layers is sketched below).
- head_concentration ~= k/n for the top k of n heads (e.g. 10/32 ~= 0.31):
  the concept is spread roughly evenly across heads, and the
  residual-stream diff-of-means is already doing fine.
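
A minimal consumer that flags sparse circuits might look like this
(the output path and the assumption that quality.json is keyed by
concept are illustrative):

    import json
    from pathlib import Path

    report = json.loads(Path("out/quality.json").read_text())
    for concept, entry in report.items():
        for layer, ph in entry.get("per_head", {}).items():
            if ph["head_concentration"] > 0.5:
                heads = [h for h, *_ in ph["top_heads"][:3]]
                print(f"{concept} @ layer {layer}: top heads {heads}")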

Hybrid layers (Mamba, GatedDeltaNet) whose attention path doesn't
match the standard module layout are silently skipped.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

@@ -216,6 +216,203 @@ def _load_corpus(stories_dir: Path, paired_dir: Path | None) -> tuple[
    return positives, baselines


def _find_o_proj(layer) -> torch.nn.Module | None:
    """Locate the attention output projection within a transformer layer."""
    for path in (
        "self_attn.o_proj",
        "self_attn.out_proj",
        "attention.o_proj",
        "attn.out_proj",
    ):
        obj = layer
        ok = True
        for part in path.split("."):
            if not hasattr(obj, part):
                ok = False
                break
            obj = getattr(obj, part)
        if ok:
            return obj
    return None


def _collect_attention_inputs(
    model,
    tokenizer,
    texts: list[str],
    target_layers: list[int],
    device: torch.device,
    batch_size: int,
    max_length: int,
    *,
    label: str = "",
) -> tuple[torch.Tensor, list[int]]:
    """Capture the INPUT to o_proj at each target layer (= concat of per-head
    attention outputs right before the output projection).

    Returns (tensor [n_texts, n_active_layers, hidden_dim], active_layers).
    The active_layers list is the subset of target_layers whose attention
    module exposed a recognisable o_proj path; hybrid layers (Mamba, etc.)
    may be silently skipped.
    """
    import time

    layers_module = _find_layers_module(model)
    captures: dict[int, torch.Tensor] = {}
    handles = []
    active_layers: list[int] = []

    def make_hook(idx: int):
        def hook(_mod, inputs):
            x = inputs[0] if isinstance(inputs, tuple) else inputs
            captures[idx] = x.detach()

        return hook

    for idx in target_layers:
        o_proj = _find_o_proj(layers_module[idx])
        if o_proj is not None:
            handles.append(o_proj.register_forward_pre_hook(make_hook(idx)))
            active_layers.append(idx)

    if not active_layers:
        return torch.zeros(0, 0, 0), []

    out_rows: list[torch.Tensor] = []
    n_batches = (len(texts) + batch_size - 1) // batch_size
    start = time.time()
    try:
        model.eval()
        with torch.no_grad():
            for b_idx, i in enumerate(range(0, len(texts), batch_size)):
                batch = texts[i : i + batch_size]
                tok = tokenizer(
                    batch,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=max_length,
                ).to(device)
                captures.clear()
                model(**tok)
                per_layer = [
                    _pool_last(captures[idx], tok["attention_mask"])
                    .to(torch.float32)
                    .cpu()
                    for idx in active_layers
                ]
                out_rows.append(torch.stack(per_layer, dim=1))
                del tok, captures
                if b_idx % 10 == 0:
                    torch.cuda.empty_cache()
                if b_idx % 5 == 0 or b_idx == n_batches - 1:
                    elapsed = time.time() - start
                    rate = (b_idx + 1) / elapsed if elapsed > 0 else 0
                    eta = (n_batches - b_idx - 1) / rate if rate > 0 else 0
                    print(
                        f" [{label}] batch {b_idx + 1}/{n_batches} "
                        f"({elapsed:.0f}s elapsed, ~{eta:.0f}s remaining)",
                        flush=True,
                    )
                captures = {}
    finally:
        for h in handles:
            h.remove()

    return torch.cat(out_rows, dim=0), active_layers


def _compute_per_head_ranking(
    emotions: list[str],
    attn_inputs: torch.Tensor,  # [n_stories, n_active_layers, hidden]
    baseline_attn_inputs: torch.Tensor,
    positives_by_emotion: dict[str, list[str]],
    text_to_row: dict[str, int],
    active_layers: list[int],
    n_heads_per_layer: dict[int, int],
    text_to_emotion: dict[str, str],
    unique_positive_texts: list[str],
) -> dict:
    """For each concept, rank attention heads by contribution magnitude.

    Per (concept, layer): reshape the o_proj input to [n_heads, head_dim],
    compute the diff-of-means between positives and negatives per head, and
    rank heads by the L2 norm of that diff normalised by the negatives'
    per-head std (selectivity). The top heads are the ones most strongly
    implicated in the concept circuit.

    Why this matters: meta-relational concepts (trust, recognition,
    "seen") often don't give a strong residual-stream diff-of-means but
    DO give a strong per-head signal: the concept lives in a small
    attention circuit rather than in the residual-stream sum.
    """
    result: dict[str, dict] = {}
    for emotion in emotions:
        pos_rows = [text_to_row[t] for t in positives_by_emotion[emotion]]
        neg_rows = [
            i
            for i, t in enumerate(unique_positive_texts)
            if text_to_emotion[t] != emotion
        ]
        pos = attn_inputs[pos_rows]  # [n_pos, n_layers, hidden]
        neg = attn_inputs[neg_rows]
        if baseline_attn_inputs.shape[0] > 0:
            neg = torch.cat([neg, baseline_attn_inputs], dim=0)
        per_layer: dict[str, dict] = {}
        for l_idx, target_l in enumerate(active_layers):
            n_heads = n_heads_per_layer.get(target_l)
            if not n_heads:
                continue
            hidden = pos.shape[-1]
            if hidden % n_heads != 0:
                continue
            head_dim = hidden // n_heads
            pos_l = pos[:, l_idx, :].view(-1, n_heads, head_dim)
            neg_l = neg[:, l_idx, :].view(-1, n_heads, head_dim)
            diff = pos_l.mean(dim=0) - neg_l.mean(dim=0)  # [n_heads, head_dim]
            head_norms = diff.norm(dim=-1)  # [n_heads]
            # Normalise by neg variance per head so different-scale heads
            # don't dominate purely on activation magnitude.
            neg_std = neg_l.std(dim=0).norm(dim=-1).clamp_min(1e-6)
            head_selectivity = head_norms / neg_std  # [n_heads]
            k = min(10, n_heads)
            top_vals, top_idxs = head_selectivity.topk(k)
            top_heads = [
                [int(i), float(head_norms[i]), float(head_selectivity[i])]
                for i in top_idxs
            ]
            per_layer[str(target_l)] = {
                "n_heads": n_heads,
                "head_dim": head_dim,
                "top_heads": top_heads,  # [head_idx, raw_norm, selectivity]
                "head_concentration": float(
                    # fraction of total head-norm captured by top-k
                    head_norms[top_idxs].sum() / head_norms.sum().clamp_min(1e-6)
                ),
            }
        result[emotion] = {"per_layer": per_layer}
    return result


def _get_n_heads_per_layer(model, target_layers: list[int]) -> dict[int, int]:
    """Best-effort read of num_attention_heads per layer. Prefers the text
    sub-config via get_text_config() when the model exposes it, otherwise
    the top-level config (Qwen keeps num_attention_heads there). Returns
    {} if the attribute is missing.
    """
    cfg = model.config
    if hasattr(cfg, "get_text_config"):
        cfg = cfg.get_text_config()
    n = getattr(cfg, "num_attention_heads", None)
    if n is None:
        return {}
    return {l: n for l in target_layers}


def _find_mlp_down_proj(model, layer_idx: int) -> torch.Tensor | None:
    """Return the W_down weight for the MLP at the given transformer layer.
@@ -643,6 +840,49 @@ def main() -> None:
        positive_texts=unique_positive_texts,
        text_to_emotion=text_to_emotion,
    )

    # Per-head attention decomposition — second pass, captures
    # o_proj's input at each target layer and ranks heads per concept
    # by selectivity. Meta-relational concepts often live in specific
    # attention heads rather than the residual-stream sum; this
    # diagnostic surfaces that.
    print("\nCollecting o_proj inputs for per-head analysis...")
    attn_inputs, active_layers = _collect_attention_inputs(
        model, tokenizer, unique_positive_texts, target_layers, device,
        args.batch_size, args.max_length, label="attn-pos",
    )
    if active_layers and baselines:
        baseline_attn_inputs, _ = _collect_attention_inputs(
            model, tokenizer, baselines, active_layers, device,
            args.batch_size, args.max_length, label="attn-base",
        )
    else:
        baseline_attn_inputs = torch.zeros(0, len(active_layers), hidden_dim)
    if active_layers:
        n_heads_per_layer = _get_n_heads_per_layer(model, active_layers)
        per_head = _compute_per_head_ranking(
            emotions=emotions,
            attn_inputs=attn_inputs,
            baseline_attn_inputs=baseline_attn_inputs,
            positives_by_emotion=positives_by_emotion,
            text_to_row=text_to_row,
            active_layers=active_layers,
            n_heads_per_layer=n_heads_per_layer,
            text_to_emotion=text_to_emotion,
            unique_positive_texts=unique_positive_texts,
        )
        # Fold per-head into the main report under each concept.
        for emotion, ph in per_head.items():
            if emotion in report:
                report[emotion]["per_head"] = ph["per_layer"]
        print(f"Per-head analysis done on layers {active_layers}")
    else:
        print(
            "No layer exposed a recognisable o_proj module path — "
            "per-head analysis skipped."
        )

    (output_dir / "quality.json").write_text(
        json.dumps(report, indent=2) + "\n"
    )