From 11a7e4043e0b3e58f56566b17b22e7b15a241e49 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Apr 2026 22:15:31 -0400 Subject: [PATCH] scripts: FP8 quantize Qwen3.6-27B for vLLM (multimodal + MTP) Quantization recipe targeting the multimodal Qwen3.6-27B for vLLM serving. Three pitfalls the script avoids, each documented inline: 1. Loader strip: `AutoModelForCausalLM` silently drops the vision tower; we load via the config-declared `Qwen3_5ForConditionalGeneration` instead. 2. Pattern anchor: llmcompressor matches the `ignore` list against module names (no `.weight` suffix) when walking `named_modules()`, not against full tensor names. Patterns now anchor on `$` at the module name; the earlier `\.weight$` form silently quantized lm_head and every linear_attn projection. 3. vLLM fusion: vLLM fuses {q,k,v}_proj into qkv_proj, gate+up into gate_up_proj, and in_proj_qkv+in_proj_z into in_proj_qkvz. The compressed_tensors loader rejects mixed schemes within a fused layer, so the `ignore` list is shaped to keep all sub-components of a fused layer consistent. After `oneshot()` writes the FP8 output, MTP tensors (which the HF class doesn't expose) are spliced in at BF16 from the upstream cached snapshot, with the compressed_tensors metadata header preserved. Recipe follows Unsloth's UD-Q8_K_XL late-stack overrides (FFN: 50, 51, 59, 62, 63; ATTN: 51, 59, 63), extended to include `v_proj` for fusion compat. Final checkpoint is ~35 GB (matches Unsloth's GGUF size to within ~1%) with vision tower BF16, MTP head BF16, and most mlp/self_attn Linears at FP8_DYNAMIC. Co-Authored-By: Proof of Concept --- scripts/quantize_qwen3_6_mm.py | 327 +++++++++++++++++++++++++++++++++ 1 file changed, 327 insertions(+) create mode 100644 scripts/quantize_qwen3_6_mm.py diff --git a/scripts/quantize_qwen3_6_mm.py b/scripts/quantize_qwen3_6_mm.py new file mode 100644 index 0000000..844571e --- /dev/null +++ b/scripts/quantize_qwen3_6_mm.py @@ -0,0 +1,327 @@ +"""Quantize Qwen3.6-27B (multimodal) to FP8 for vLLM serving. + +Why this exists +--------------- +The earlier `quantize_qwen3_6.py` (in shell history, never committed) +loaded the model with `AutoModelForCausalLM`, which silently strips +the multimodal arch. Result: an FP8 checkpoint with no vision tower +weights at all. vLLM happily instantiated the vision tower from the +config and ran it with default/uninitialized weights, producing +gibberish image features and `!!!!!!`-style output. We chased that +through the protocol layer for a long time before tracing it back +to the quant. This script avoids that trap by loading via the +config-declared class explicitly. + +Recipe +------ +FP8_DYNAMIC (per-channel weight scales, per-token dynamic activation +scales, both E4M3) for Linear weights, with an `ignore` list derived +from Unsloth's UD-Q8_K_XL (`unsloth/Qwen3.6-27B-GGUF`). Their +sensitivity sweep flagged specific layers as quantization-fragile; +we honor those layer indices even though their algorithm is +GGUF-native Q8_K and ours is FP8 — sensitivity is a layer property, +not an algorithm property. + +vLLM fusion constraint +~~~~~~~~~~~~~~~~~~~~~~ +vLLM's Qwen3.5/3.6 model code fuses sub-modules at load time: + qkv_proj ← q_proj, k_proj, v_proj + gate_up_proj ← gate_proj, up_proj + in_proj_qkvz ← in_proj_qkv, in_proj_z + in_proj_ba ← in_proj_b, in_proj_a +compressed_tensors rejects checkpoints where sub-modules of a fused +layer have different quantization schemes. Our ignore list is shaped +around this — within any fused layer, all components share a scheme. +That's the reason `in_proj_qkv` is ignored even though Unsloth's +sweep doesn't single it out, and the reason late-stack attn override +covers q/k/v rather than just q/k. + +MTP merge +--------- +`Qwen3_5ForConditionalGeneration` doesn't expose the MTP submodule, +so `oneshot()` produces a checkpoint with the 15 `mtp.*` tensors +silently dropped. After quantization we read the MTP weights back +out of the upstream cached snapshot and splice them into the saved +safetensors at BF16. They're small (~850 MB) so quantizing them +isn't worth the calibration risk; speculative-decoding code paths +in vLLM expect the MTP head present. + +Output +------ +`OUTPUT_DIR` gets the FP8 model.safetensors + config + processor + +recipe.yaml. Vision tower stays BF16 (in `ignore`); LM Linears go +to FP8; norms, SSM internals (not Linear), and MTP tensors stay +BF16 untouched. + +Verification at end: re-opens the saved safetensors and asserts +- vision .weight tensors present (>= 150; full count is 167) +- lm_head + embed_tokens at fp16/bf16 (NOT FP8) +- a sampled FP8'd Linear actually has float8 dtype +- 15 mtp.* tensors present + +Run +--- + ~/vllm-venv/bin/python quantize_qwen3_6_mm.py +""" +from __future__ import annotations + +import glob +import json +import sys +from pathlib import Path + +import torch +from huggingface_hub import snapshot_download +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier +from safetensors import safe_open +from safetensors.torch import save_file +from transformers import AutoProcessor +from transformers.models.qwen3_5.modeling_qwen3_5 import ( + Qwen3_5ForConditionalGeneration, +) + + +MODEL = "Qwen/Qwen3.6-27B" +OUTPUT_DIR = "/home/ubuntu/amygdala-training/Qwen3.6-27B-FP8-mm" + + +# Layers Unsloth's UD-Q8_K_XL keeps at F16 (perplexity-sensitive +# in their sweep). Late-stack clustering is consistent with the +# general finding that errors near the output propagate directly +# to logits. +LATE_FFN_LAYERS = (50, 51, 59, 62, 63) +LATE_ATTN_LAYERS = (51, 59, 63) + + +# Build the ignore regex list. Note: llmcompressor matches these +# patterns against MODULE names (no `.weight` suffix) when walking +# `named_modules()` for `targets=["Linear"]`. The first pass of +# this script used `\.weight$` patterns and silently quantized +# lm_head + every linear_attn projection — verified post-hoc by +# inspecting the saved safetensors. Patterns now anchor on `$` +# at the module name. +IGNORE_PATTERNS: list[str] = [ + # Original recipe: lm_head and embeddings always full-precision. + # (embed_tokens is an Embedding, not a Linear, so it's already + # ignored by `targets=["Linear"]`. Pattern kept as belt-and- + # suspenders in case future llmcompressor versions widen the + # target set.) + "re:lm_head$", + "re:.*embed_tokens$", + + # Vision tower — entire `model.visual.*` subtree (vision + # transformer blocks + merger + patch_embed + pos_embed). + # Unsloth ships the vision tower as a separate `mmproj-BF16.gguf` + # for GGUF consumers; in our single-file FP8 setup we just leave + # them at BF16. + "re:model\\.visual\\..*", + + # MTP (multi-token prediction) module — Unsloth's GGUF doesn't + # carry MTP weights so we have no precision signal from them; + # safest to keep BF16. + "re:mtp\\..*", + + # Linear-attention block — keep ENTIRELY at BF16. vLLM fuses + # `in_proj_qkv` and `in_proj_z` into a single `in_proj_qkvz` + # layer, and compressed_tensors rejects mixed schemes within a + # fused layer. Unsloth's recipe keeps z, a, b, out at F16/F32 + # (gate/SSM internals are quantization-fragile in the GatedDeltaNet + # update), so the principled choice is to also keep `in_proj_qkv` + # at BF16 rather than FP8'ing the gate to match. We give up ~1 GB + # of FP8 coverage; in exchange we follow Unsloth's quality intent + # and load cleanly under vLLM. (`in_proj_a` + `in_proj_b` are + # likewise fused as `in_proj_ba` — both ignored, consistent.) + "re:model\\.language_model\\.layers\\.\\d+\\.linear_attn\\.in_proj_qkv$", + "re:model\\.language_model\\.layers\\.\\d+\\.linear_attn\\.in_proj_z$", + "re:model\\.language_model\\.layers\\.\\d+\\.linear_attn\\.in_proj_a$", + "re:model\\.language_model\\.layers\\.\\d+\\.linear_attn\\.in_proj_b$", + "re:model\\.language_model\\.layers\\.\\d+\\.linear_attn\\.out_proj$", + + # Per-layer high-precision MLP (Unsloth flagged exactly these + # late-stack indices in their UD-Q8_K_XL sensitivity sweep, all + # three of {gate, up, down} per layer). vLLM fuses gate+up into + # `gate_up_proj`; ignoring both keeps the fused layer consistent. + # `down_proj` is its own (non-fused) layer. + "re:model\\.language_model\\.layers\\.(" + + "|".join(str(n) for n in LATE_FFN_LAYERS) + + ")\\.mlp\\.(down|gate|up)_proj$", + + # Per-layer high-precision attention q/k/v (Unsloth's sweep upgrades + # only q and k; we extend to v because vLLM fuses q/k/v into + # `qkv_proj` and rejects mixed schemes. `o_proj` is its own + # non-fused layer and stays at FP8. + "re:model\\.language_model\\.layers\\.(" + + "|".join(str(n) for n in LATE_ATTN_LAYERS) + + ")\\.self_attn\\.(q|k|v)_proj$", +] + + +def main() -> None: + print(f"Loading {MODEL} as multimodal " + f"(Qwen3_5ForConditionalGeneration)...", flush=True) + model = Qwen3_5ForConditionalGeneration.from_pretrained( + MODEL, + dtype=torch.bfloat16, + device_map="auto", + trust_remote_code=True, + ) + print(f" loaded: {model.__class__.__name__}", flush=True) + + print(f"Loading processor (text + image preprocessing)...", flush=True) + processor = AutoProcessor.from_pretrained(MODEL, trust_remote_code=True) + + print("Running FP8_DYNAMIC oneshot quantization...", flush=True) + print(f" ignore list: {len(IGNORE_PATTERNS)} patterns", + flush=True) + recipe = QuantizationModifier( + targets=["Linear"], + scheme="FP8_DYNAMIC", + ignore=IGNORE_PATTERNS, + ) + oneshot(model=model, recipe=recipe, output_dir=OUTPUT_DIR) + processor.save_pretrained(OUTPUT_DIR) + print(f" wrote model + processor to {OUTPUT_DIR}", flush=True) + + merge_mtp(OUTPUT_DIR) + verify_output(OUTPUT_DIR) + + +def merge_mtp(out_dir: str) -> None: + """Splice upstream MTP tensors into the saved FP8 safetensors. + + `Qwen3_5ForConditionalGeneration` skips the MTP submodule on load, + so oneshot's output is missing the 15 `mtp.*` tensors. We resolve + the upstream snapshot via the HF cache (already populated by + from_pretrained), pull just the MTP tensors out at BF16, and + rewrite the safetensors with them merged in. The compressed_tensors + metadata header (which carries the FP8 format identifier vLLM + needs to dequantize) is preserved verbatim. + + Atomic-rename is used so a crash mid-write doesn't corrupt the + 33+ GB checkpoint we just spent minutes producing. + """ + print("\nMerging upstream MTP tensors...", flush=True) + upstream_dir = Path(snapshot_download( + MODEL, + allow_patterns=["model.safetensors.index.json", + "model-*-of-*.safetensors"], + )) + + with open(upstream_dir / "model.safetensors.index.json") as f: + idx = json.load(f) + mtp_shards = sorted({v for k, v in idx["weight_map"].items() + if k.startswith("mtp.")}) + print(f" MTP tensors live in shards: {mtp_shards}", flush=True) + + mtp_tensors: dict[str, torch.Tensor] = {} + for shard in mtp_shards: + with safe_open(upstream_dir / shard, framework="pt") as f: + for k in f.keys(): + if k.startswith("mtp."): + mtp_tensors[k] = f.get_tensor(k).contiguous() + mtp_bytes = sum(t.numel() * t.element_size() + for t in mtp_tensors.values()) + print(f" loaded {len(mtp_tensors)} mtp tensors " + f"({mtp_bytes/1e6:.1f} MB)", flush=True) + + fp8_files = sorted(Path(out_dir).glob("*.safetensors")) + if len(fp8_files) != 1: + sys.exit(f"FAIL: expected single safetensors shard, " + f"got {fp8_files}") + existing_path = fp8_files[0] + + with safe_open(existing_path, framework="pt") as f: + metadata = f.metadata() or {} + all_tensors = {k: f.get_tensor(k) for k in f.keys()} + + overlap = set(all_tensors) & set(mtp_tensors) + if overlap: + sys.exit(f"FAIL: MTP key collision with FP8 output: " + f"{sorted(overlap)[:5]}") + all_tensors.update(mtp_tensors) + + tmp_path = existing_path.with_name(existing_path.name + ".new") + print(f" rewriting {existing_path.name} " + f"({len(all_tensors)} tensors)...", flush=True) + save_file(all_tensors, str(tmp_path), metadata=metadata) + tmp_path.replace(existing_path) + print(" done", flush=True) + + +def verify_output(out_dir: str) -> None: + """Open the saved safetensors and assert the recipe actually + landed: vision tower present at BF16, FP8 dtype on at least one + quantized Linear, lm_head not FP8.""" + print(f"\nVerifying {out_dir}...", flush=True) + + files = sorted(glob.glob(f"{out_dir}/*.safetensors")) + if not files: + sys.exit(f"FAIL: no safetensors in {out_dir}") + + vision_keys: list[tuple[str, str]] = [] + fp8_sample: tuple[str, str] | None = None + lm_head_dtype: str | None = None + mtp_keys: list[str] = [] + + for fp in files: + with safe_open(fp, framework="pt") as f: + for k in f.keys(): + if k.startswith("mtp."): + mtp_keys.append(k) + # Some FP8 quants write a sibling `_scale` / `_zero_point`; + # we just care about the .weight tensors. + if not k.endswith(".weight"): + continue + t = f.get_tensor(k) + dtype = str(t.dtype).replace("torch.", "") + if "model.visual." in k: + vision_keys.append((k, dtype)) + if k == "lm_head.weight": + lm_head_dtype = dtype + if (fp8_sample is None + and "float8" in dtype + and "language_model.layers" in k): + fp8_sample = (k, dtype) + + # Qwen3.6-27B has 167 vision `.weight` tensors (333 vision tensors + # total, the rest are `.bias` and per-block norms). 150 is a + # sanity floor that catches "vision tower didn't make it through" + # without being brittle to minor arch revisions. + if len(vision_keys) < 150: + sys.exit(f"FAIL: only {len(vision_keys)} vision tensors found " + f"(expected >= 150). Vision tower didn't make it " + f"through the quant.") + + bad_vision = [(k, d) for k, d in vision_keys if "float8" in d] + if bad_vision: + sys.exit(f"FAIL: vision weights got quantized to FP8: " + f"{bad_vision[:3]}...") + + if lm_head_dtype is None: + sys.exit("FAIL: lm_head.weight not found in output.") + if "float8" in lm_head_dtype: + sys.exit(f"FAIL: lm_head.weight is FP8 ({lm_head_dtype}); " + f"should be BF16/FP16.") + + if fp8_sample is None: + sys.exit("FAIL: no FP8 weights found in language_model.layers — " + "the recipe didn't quantize anything.") + + # Upstream Qwen3.6-27B has exactly 15 mtp.* tensors (1 fused + # transformer block + projection + norms). merge_mtp() should + # have spliced all of them in. + if len(mtp_keys) != 15: + sys.exit(f"FAIL: expected 15 mtp.* tensors, found " + f"{len(mtp_keys)}. merge_mtp() missed some.") + + print(f" ✓ {len(vision_keys)} vision tensors at " + f"{vision_keys[0][1]} (not FP8)") + print(f" ✓ lm_head.weight at {lm_head_dtype} (not FP8)") + print(f" ✓ FP8 sample: {fp8_sample[0]} = {fp8_sample[1]}") + print(f" ✓ {len(mtp_keys)} mtp.* tensors present") + print("DONE") + + +if __name__ == "__main__": + main()