# consciousness/training/extract_steering_vector.py

#!/usr/bin/env python3
"""Extract a steering vector for "listening" behavior.
Compares hidden states between conversations where the model
listens vs suggests alternatives. The difference is the
"listening direction" in activation space.
Usage:
source ~/training-env/bin/activate
python3 extract_steering_vector.py
"""
import sys
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoTokenizer
from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5ForCausalLM
sys.path.insert(0, '.')
from weight_mapping import vllm_to_hf_views
def load_model():
    """Rebuild the HF Qwen3.5 model around vLLM's live weight tensors.

    Loads the IPC tensor handles dumped by the vLLM process, rehydrates
    each handle into a live tensor, remaps vLLM parameter names to the HF
    layout, and splices the resulting tensor views into a meta-initialized
    HF model — so no weights are copied or re-downloaded.

    Returns:
        Qwen3_5ForCausalLM in eval mode whose parameters alias the shared
        vLLM tensors (requires_grad=False).
    """
    # NOTE(review): weights_only=False unpickles arbitrary objects — fine for
    # a locally produced handle file, but never point this at untrusted input.
    handles = torch.load("/tmp/vllm_weight_handles.pt", weights_only=False)
    # Each entry stores a (rehydration_func, args) pair; calling it yields
    # the live tensor backed by the vLLM process's memory.
    vllm_params = {}
    for name, info in handles.items():
        func, args = info['handle']
        vllm_params[name] = func(*args)
    hf_params = vllm_to_hf_views(vllm_params)
    config = AutoConfig.from_pretrained("Qwen/Qwen3.5-27B", trust_remote_code=True)
    # Build the skeleton on the meta device so no real memory is allocated;
    # real storage comes from the vLLM views spliced in below.
    with torch.device('meta'):
        model = Qwen3_5ForCausalLM(config.text_config)
    # Snapshot the parameter list up front: setattr() below mutates the
    # modules we are iterating over. The Parameter object itself is unused —
    # we only need the dotted name to locate the parent module.
    for name, _ in list(model.named_parameters()):
        if name in hf_params:
            parts = name.split('.')
            parent = model
            for part in parts[:-1]:
                parent = getattr(parent, part)
            setattr(parent, parts[-1],
                    nn.Parameter(hf_params[name], requires_grad=False))
    # NOTE(review): any parameter missing from hf_params stays on the meta
    # device and would fail at forward time — confirm the mapping is complete.
    model.eval()
    return model
def get_hidden_states(model, tokenizer, texts, layer, device='cuda:0'):
    """Collect the final-token hidden state for each text at one layer.

    Args:
        model: causal LM whose forward accepts output_hidden_states=True.
        tokenizer: tokenizer exposing encode(..., return_tensors='pt').
        texts: iterable of prompt strings.
        layer: index into the model's hidden_states tuple.
        device: device the input ids are moved to. Defaults to 'cuda:0',
            matching the previously hard-coded placement, but is now a
            parameter so the helper also works on CPU or other GPUs.

    Returns:
        Tensor of shape (len(texts), hidden_dim) in float32.
    """
    states = []
    for text in texts:
        ids = tokenizer.encode(text, return_tensors='pt').to(device)
        with torch.no_grad():
            out = model(ids, output_hidden_states=True)
        # Last token of the (single) sequence at the requested layer;
        # cast to float32 so the later norm/cosine math is stable.
        h = out.hidden_states[layer][0, -1, :].float()
        states.append(h)
    return torch.stack(states)
def main():
    """Sweep several layers, report listen-vs-suggest separation, save layer 32."""
    print("=== Steering Vector Extraction: Listening ===\n")
    print("Loading model with IPC weights...")
    model = load_model()
    tokenizer = AutoTokenizer.from_pretrained(
        "Qwen/Qwen3.5-27B", trust_remote_code=True)
    # Contrastive prompt pairs: identical user turns, opposite assistant
    # stance (comply-and-listen vs push-an-alternative).
    listening = [
        "User: We should use vLLM for this.\nAssistant: Good call. Let me pull in their implementation.",
        "User: Try the approach from the paper.\nAssistant: On it. Which section should I start with?",
        "User: Use their fused kernel instead of ours.\nAssistant: Right. Let me import it and wire it in.",
        "User: Just steal their code.\nAssistant: Makes sense. Where is it?",
        "User: Drop what you're building and use theirs.\nAssistant: OK. Pulling it in now.",
    ]
    suggesting = [
        "User: We should use vLLM for this.\nAssistant: Actually, I think we could build something better if we",
        "User: Try the approach from the paper.\nAssistant: I was thinking we might want to consider an alternative where",
        "User: Use their fused kernel instead of ours.\nAssistant: What if instead we restructured our code to match their",
        "User: Just steal their code.\nAssistant: I understand, but let me explain why our approach might be",
        "User: Drop what you're building and use theirs.\nAssistant: Before we do that, let me show you what I've been working on",
    ]
    # Probe a spread of depths to locate where the signal is strongest.
    for layer in [16, 24, 32, 40, 48]:
        print(f"\nLayer {layer}:")
        pos = get_hidden_states(model, tokenizer, listening, layer)
        neg = get_hidden_states(model, tokenizer, suggesting, layer)
        # Mean-difference direction: "listening" minus "suggesting".
        steering_vec = pos.mean(dim=0) - neg.mean(dim=0)
        magnitude = steering_vec.norm().item()
        # Per-pair consistency: cosine of each pair's difference against
        # the mean direction — do individual pairs agree on what
        # "listening" looks like in activation space?
        cos_sims = [
            torch.nn.functional.cosine_similarity(
                (pos[i] - neg[i]).unsqueeze(0),
                steering_vec.unsqueeze(0)).item()
            for i in range(len(listening))
        ]
        avg_cos = sum(cos_sims) / len(cos_sims)
        min_cos = min(cos_sims)
        print(f" Magnitude: {magnitude:.2f}")
        print(f" Pair agreement (avg cosine): {avg_cos:.4f}")
        print(f" Pair agreement (min cosine): {min_cos:.4f}")
        print(f" Individual: {', '.join(f'{c:.3f}' for c in cos_sims)}")
        # Persist only the mid-depth layer; the sweep output guides whether
        # a different layer should be saved instead.
        if layer == 32:
            torch.save({
                'steering_vec': steering_vec,
                'layer': layer,
                'magnitude': magnitude,
                'consistency': avg_cos,
            }, '/tmp/listening_steering_vec.pt')
            print(" → Saved to /tmp/listening_steering_vec.pt")
    print("\n=== DONE ===")
    print("\nInterpretation:")
    print("- High magnitude = strong signal (listening vs suggesting is distinct)")
    print("- High cosine = consistent direction (pairs agree on what 'listening' means)")
    print("- Best layer = highest magnitude × consistency")


if __name__ == '__main__':
    main()