forked from kent/consciousness
234 lines
9.6 KiB
Python
234 lines
9.6 KiB
Python
|
|
"""Per-layer residual-stream delta SVD: δ_L = h_{L+1} - h_L stacked
|
|||
|
|
over all tokens in a calibration set. SVD gives us:
|
|||
|
|
|
|||
|
|
- top singular value per layer → γ_L (scalar magnitude, what Kirkpatrick fit)
|
|||
|
|
- top right-singular-vector per layer → v_L (direction in hidden space)
|
|||
|
|
- effective rank per layer → is this one direction or many?
|
|||
|
|
- pairwise v_L cos-sim across layers → are layers subspace-disjoint or -shared?
|
|||
|
|
|
|||
|
|
This directly tests the anisotropic-SA hypothesis:
|
|||
|
|
h_{L+1} = h_L + T_shared(h_L) + γ_L · v_L · f(...)
|
|||
|
|
|
|||
|
|
Phase C prediction: v_L vectors cover broad shared subspace (high mutual cos-sim,
|
|||
|
|
rank-few overall), δ_L is mostly noise around a shared update.
|
|||
|
|
Phase E prediction: v_L vectors are specialized (low pairwise cos-sim, each layer
|
|||
|
|
its own direction), effective rank of the block is close to N.
|
|||
|
|
|
|||
|
|
Qwen3-32B phases: A 0-6, B 7-9, C 10-31, D 32-46, E 47-58, tail 59-63.
|
|||
|
|
"""
|
|||
|
|
import argparse
|
|||
|
|
import json
|
|||
|
|
import numpy as np
|
|||
|
|
import torch
|
|||
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Calibration prompts. main() joins them with blank lines and runs them through
# the model as a single token stream; the mix spans factual recall, arithmetic,
# code, SQL, narrative, chat-style and translation text.
CALIB = [
    "The Eiffel Tower is located in",
    "Photosynthesis is the process by which",
    "The three branches of the US government are the legislative, executive, and",
    "If a train travels 60 miles per hour for 2.5 hours, the total distance covered is",
    "Solve for x: 3x + 7 = 22. The answer is x =",
    "The derivative of x^3 + 2x^2 is",
    "def fibonacci(n):\n if n < 2:\n return n\n return",
    "# Python list comprehension to square even numbers in 0-9\nresult = ",
    "SELECT name, age FROM users WHERE",
    "She opened the old wooden box and found",
    "The argument in favor of renewable energy is",
    "User: What is the capital of Australia?\nAssistant:",
    "Write a haiku about autumn:\n",
    "Albert Einstein was born in the year",
    "The speed of light in vacuum is approximately",
    "I really loved that movie because",
    "The main difference between a virus and a bacterium is",
    "The French word for 'apple' is",
    "1 + 1 = ",
    "Once upon a time, in a land far away,",
    "The key insight of general relativity is that gravity is not a force but",
    "Water boils at 100 degrees Celsius at standard atmospheric pressure. At higher",
    "In object-oriented programming, encapsulation refers to",
    "The mitochondria is often called the powerhouse of the cell because it",
    "Shakespeare's Hamlet begins with the famous line",
]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def phase_of(L, num_layers):
    """Return the phase label for layer index ``L`` of a ``num_layers``-deep model.

    For a 64-layer model the hand-assigned ranges are used:
    A 0-6, B 7-9, C 10-31, D 32-46, E 47-58, tail 59-63.
    For any other depth the label is picked from the relative depth
    ``L / num_layers`` using fractional cut-offs (these only approximate the
    64-layer table; they are not an exact rescaling of it).

    Args:
        L: zero-based layer index.
        num_layers: total number of layers in the model.

    Returns:
        One of "A", "B", "C", "D", "E" or "tail".

    Raises:
        ValueError: if ``num_layers`` is not positive.
    """
    if num_layers <= 0:
        raise ValueError(f"num_layers must be positive, got {num_layers}")

    if num_layers == 64:
        # (inclusive last layer index, label), in increasing order.
        for last_layer, label in ((6, "A"), (9, "B"), (31, "C"), (46, "D"), (58, "E")):
            if L <= last_layer:
                return label
        return "tail"

    # (exclusive upper bound on relative depth, label), in increasing order.
    frac = L / num_layers
    for upper, label in ((0.11, "A"), (0.15, "B"), (0.5, "C"), (0.75, "D"), (0.92, "E")):
        if frac < upper:
            return label
    return "tail"
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _layer_delta_stats(delta, h_in, top_k):
    """Summarise one layer's residual update with norms, alignment and an SVD.

    Args:
        delta: 2-D array whose rows are per-token updates (hs[L+1] - hs[L]).
        h_in: 2-D array of the same shape holding the stream entering the layer.
        top_k: number of leading singular values / right-singular vectors kept.

    Returns:
        dict of JSON-friendly scalars, plus "top_singvals" (list of floats) and
        "top_dirs" (a (<=top_k, hidden) float64 array, one direction per row).
    """
    token_norms = np.linalg.norm(delta, axis=1)   # ||δ|| per token
    h_norms = np.linalg.norm(h_in, axis=1)        # ||h|| per token
    # Relative step size ||δ|| / ||h||; the floor avoids division by zero.
    rel_step = token_norms / np.maximum(h_norms, 1e-8)
    # Per-token cosine between δ and h: <δ, h> / (||δ|| ||h||).
    dot = np.einsum("nd,nd->n", delta, h_in)
    cos_delta_h = dot / np.maximum(token_norms * h_norms, 1e-8)

    # Economy SVD; S is in decreasing order, rows of Vt are directions.
    _, S, Vt = np.linalg.svd(delta, full_matrices=False)

    # Effective rank = exp(entropy of the normalised squared singular values).
    energy = S ** 2
    total = energy.sum()
    if total > 0:
        p = energy / total
        p_nz = p[p > 1e-12]
        eff_rank = float(np.exp(-(p_nz * np.log(p_nz)).sum()))
    else:
        # An all-zero update has no spectrum; report zeros rather than NaN.
        p = np.zeros_like(energy)
        eff_rank = 0.0

    return {
        "frob": float(np.linalg.norm(delta)),
        "token_norm_mean": float(token_norms.mean()),
        "token_norm_std": float(token_norms.std()),
        "h_norm_mean": float(h_norms.mean()),
        "rel_step_mean": float(rel_step.mean()),
        "rel_step_std": float(rel_step.std()),
        # Mean |cos|: how much of δ points along ±h.
        "parallel_frac": float(np.abs(cos_delta_h).mean()),
        "cos_delta_h_mean": float(cos_delta_h.mean()),
        "top_singvals": S[:top_k].tolist(),
        # Kept as an array (not nested lists) for the cross-layer comparisons;
        # it is stripped before the JSON dump.
        "top_dirs": Vt[:top_k].astype(np.float64),
        "eff_rank": eff_rank,
        "top1_frac": float(p[0]),
        "top3_frac": float(p[:3].sum()),
        "top10_frac": float(p[:10].sum()),
    }


@torch.no_grad()
def main():
    """Run the per-layer delta-SVD analysis and write a JSON summary.

    Steps: parse CLI args, load the model on CUDA, run the joined CALIB prompts
    once with output_hidden_states=True, compute per-layer statistics of
    hs[L+1] - hs[L], print cross-layer / per-phase overlaps of the top singular
    directions, and save the scalar results to --out.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="Qwen/Qwen3-32B")
    ap.add_argument("--out", default="/tmp/delta-svd.json")
    ap.add_argument("--top-k", type=int, default=8,
                    help="keep top-k singular values / directions per layer")
    args = ap.parse_args()
    if args.top_k < 1:
        # Fail before the expensive model load; the analysis needs a top-1 direction.
        ap.error("--top-k must be at least 1")

    print(f"Loading {args.model} ...", flush=True)
    # NOTE: trust_remote_code=True runs code shipped with the checkpoint;
    # only point --model at repositories you trust.
    tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        args.model, torch_dtype=torch.bfloat16, device_map="cuda",
        trust_remote_code=True, attn_implementation="eager",
    ).eval()
    num_layers = model.config.num_hidden_layers
    hidden = model.config.hidden_size
    print(f" L={num_layers}, hidden={hidden}", flush=True)

    # Concat calib and tokenize as one stream
    text = "\n\n".join(CALIB)
    enc = tok(text, return_tensors="pt", truncation=True, max_length=2048).to("cuda")
    n_tok = enc.input_ids.shape[1]
    print(f" calibration tokens: {n_tok}", flush=True)

    out = model(**enc, output_hidden_states=True, use_cache=False)
    # hidden_states: tuple of (num_layers+1) tensors, each (1, n_tok, hidden).
    # hs[L] is treated as the residual stream entering layer L, so
    # δ_L = hs[L+1] - hs[L] is layer L's contribution.
    # NOTE(review): some HF model implementations apply the final norm to the
    # last hidden_states entry, which would fold that norm into the last
    # layer's δ -- confirm for the model in use.
    hs = [h[0].float().cpu().numpy() for h in out.hidden_states]
    print(f" hidden_states count: {len(hs)} (expect {num_layers+1})", flush=True)
    if len(hs) != num_layers + 1:
        raise RuntimeError(
            f"expected {num_layers + 1} hidden states, got {len(hs)}"
        )
    # The model is no longer needed; release GPU memory before the CPU SVDs.
    del model, out
    torch.cuda.empty_cache()

    # Per-layer SVD
    per_layer = []
    for L in range(num_layers):
        # Drop the first token (BOS / first-token artifacts, often an outlier).
        delta = (hs[L + 1] - hs[L])[1:]   # (n_tok - 1, hidden)
        h_in = hs[L][1:]                  # (n_tok - 1, hidden)
        phase = phase_of(L, num_layers)
        stats = _layer_delta_stats(delta, h_in, args.top_k)
        per_layer.append({"L": L, "phase": phase, **stats})
        print(f" L={L:>2} phase={phase:>4} "
              f"||h||={stats['h_norm_mean']:>7.1f} "
              f"||δ||={stats['token_norm_mean']:>7.2f} "
              f"rel={stats['rel_step_mean']:.4f} "
              f"‖parallel‖={stats['parallel_frac']:.4f} "
              f"eff_rank={stats['eff_rank']:>6.2f}",
              flush=True)

    # Pairwise cos-sim of top-1 directions across layers. Rows of Vt are
    # unit-norm, so the Gram matrix is the cosine matrix.
    top1_dirs = np.stack([pl["top_dirs"][0] for pl in per_layer])  # (num_layers, hidden)
    top1_cos = top1_dirs @ top1_dirs.T                             # (num_layers, num_layers)

    # Adjacent-layer overlap; |cos| because singular vectors have arbitrary sign.
    print("\n=== Pairwise top-1 cos-sim (adjacent) ===")
    for L in range(num_layers - 1):
        print(f" L={L:>2}→{L+1:>2} phase={phase_of(L, num_layers):>4} "
              f"|cos|={abs(top1_cos[L, L+1]):>.4f}")

    # Per-phase summary: mean |cos| within phase vs cross-phase
    phase_members = {}
    for L in range(num_layers):
        phase_members.setdefault(phase_of(L, num_layers), []).append(L)

    print("\n=== Per-phase top-1 direction overlap ===")
    print(f" {'phase':>6} {'N':>3} {'intra_cos_mean':>14} {'cross_cos_mean':>14}")
    for ph, Ls in phase_members.items():
        if len(Ls) >= 2:
            intra = np.abs(top1_cos[np.ix_(Ls, Ls)])
            # Strict upper triangle: each unordered pair once, no self-pairs.
            intra_mean = float(intra[np.triu_indices(len(Ls), k=1)].mean())
        else:
            # Single-layer phase: no pairs to average, reported as 1.0.
            intra_mean = 1.0
        members = set(Ls)
        other_Ls = [L for L in range(num_layers) if L not in members]
        if other_Ls:
            cross_mean = float(np.abs(top1_cos[np.ix_(Ls, other_Ls)]).mean())
        else:
            cross_mean = 0.0
        print(f" {ph:>6} {len(Ls):>3} {intra_mean:>14.4f} {cross_mean:>14.4f}")

    # Subspace overlap: for each phase, find the block's overall principal subspace
    # and measure how much of each individual layer sits in it.
    print("\n=== Block-shared subspace (rank-8) capture fraction per layer ===")
    for ph, Ls in phase_members.items():
        if len(Ls) < 2:
            continue
        # Stack top-k directions from all layers in phase
        block_dirs = np.concatenate([per_layer[L]["top_dirs"] for L in Ls], axis=0)
        # SVD to get the shared basis of the union
        _, _, Vt_b = np.linalg.svd(block_dirs, full_matrices=False)
        shared_basis = Vt_b[:8]  # top-8 shared directions of the block's top-k union
        # Squared norm of each layer's top-1 direction projected onto that basis.
        for L in Ls:
            v1 = per_layer[L]["top_dirs"][0]
            capture = float(((shared_basis @ v1) ** 2).sum())
            print(f" phase={ph:>4} L={L:>2} v1 captured by block top-8: {capture:.4f}")

    # Save
    save = {
        "model": args.model,
        "num_layers": num_layers,
        "hidden": hidden,
        "n_calib_tokens": int(n_tok),
        "per_layer": [
            {k: v for k, v in pl.items() if k != "top_dirs"}  # directions too big
            for pl in per_layer
        ],
        "top1_cos_adjacent": [float(top1_cos[L, L+1]) for L in range(num_layers-1)],
    }
    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(save, f, indent=2)
    print(f"\nSaved: {args.out}")
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|