From 67beaede457072bc69a13cbfe36ee3fb430a2200 Mon Sep 17 00:00:00 2001 From: Joey Eamigh <55670930+JoeyEamigh@users.noreply.github.com> Date: Tue, 7 Apr 2026 05:10:14 -0400 Subject: [PATCH] quantization + onnx sweeps Phase 10.8: torchao/bnb quant sweep on iter1-independent. bf16 already optimal; torchao int8-wo gives -19% VRAM at no F1 cost; all 4-bit variants collapse (ModernBERT-large too quant-sensitive). Phase 10.9: ONNX export + ORT eval. Legacy exporter only working path (dynamo adds 56 Memcpy nodes); ORT fp32 -22% latency vs torch via kernel fusion but bf16+flash-attn-2 still wins; fp16 broken on rotary; dynamic int8 silently CPU-fallback + 0.5 F1 collapse. Driver scripts wired to bun run py:quant / py:onnx; full reports at results/eval/{quant,onnx}/REPORT.md. --- .gitignore | 3 + docs/NARRATIVE.md | 208 ++++++++ package.json | 2 + python/pyproject.toml | 5 + python/scripts/onnx_export_eval.py | 369 +++++++++++++ python/scripts/quantize_sweep.py | 491 ++++++++++++++++++ results/eval/onnx/REPORT.md | 117 +++++ results/eval/onnx/summary.json | 50 ++ results/eval/quant/REPORT.md | 163 ++++++ results/eval/quant/bf16/metrics.json | 297 +++++++++++ results/eval/quant/bnb-fp4/metrics.json | 297 +++++++++++ results/eval/quant/bnb-int8/metrics.json | 297 +++++++++++ results/eval/quant/bnb-nf4-nodq/metrics.json | 297 +++++++++++ results/eval/quant/bnb-nf4/metrics.json | 297 +++++++++++ results/eval/quant/fp16/metrics.json | 297 +++++++++++ results/eval/quant/fp32/metrics.json | 297 +++++++++++ results/eval/quant/summary.json | 286 ++++++++++ .../eval/quant/torchao-int8-dyn/metrics.json | 297 +++++++++++ .../eval/quant/torchao-int8-wo/metrics.json | 297 +++++++++++ 19 files changed, 4367 insertions(+) create mode 100644 python/scripts/onnx_export_eval.py create mode 100644 python/scripts/quantize_sweep.py create mode 100644 results/eval/onnx/REPORT.md create mode 100644 results/eval/onnx/summary.json create mode 100644 results/eval/quant/REPORT.md create mode 100644 
results/eval/quant/bf16/metrics.json create mode 100644 results/eval/quant/bnb-fp4/metrics.json create mode 100644 results/eval/quant/bnb-int8/metrics.json create mode 100644 results/eval/quant/bnb-nf4-nodq/metrics.json create mode 100644 results/eval/quant/bnb-nf4/metrics.json create mode 100644 results/eval/quant/fp16/metrics.json create mode 100644 results/eval/quant/fp32/metrics.json create mode 100644 results/eval/quant/summary.json create mode 100644 results/eval/quant/torchao-int8-dyn/metrics.json create mode 100644 results/eval/quant/torchao-int8-wo/metrics.json diff --git a/.gitignore b/.gitignore index 7209817..ee96c44 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,10 @@ /data/ /models/ /checkpoints/ +/results/eval/onnx/models/ *.tar.zst +*.onnx +*.onnx.data # Dependencies ts/node_modules/ diff --git a/docs/NARRATIVE.md b/docs/NARRATIVE.md index 538ae03..827e075 100644 --- a/docs/NARRATIVE.md +++ b/docs/NARRATIVE.md @@ -1097,6 +1097,212 @@ epoch 3 + no temperature scaling would be a reasonable alternative choice. > in-distribution confidence memorization. Temperature scaling recovers > calibration (ECE −33% cat, −40% spec) without altering predictions."* +### 10.8 Quantization Sweep (2026-04-07) + +**Question:** does post-training quantization buy us a smaller / faster +deployable model without giving back accuracy? And — almost more +interesting — *which* quant schemes does ModernBERT-large tolerate? + +**Setup:** new sweep driver at `python/scripts/quantize_sweep.py` (wired +to `bun run py:quant`). Loads the iter1-independent checkpoint, applies +each scheme to the encoder backbone only (heads stay bf16), reruns the +full holdout eval against GPT-5.4 and Opus-4.6 proxy gold, and records +latency, peak VRAM, encoder footprint, and the full metrics suite. 5 +warmup batches before timing; batch 64; max_seq 512; RTX 3090. 
+ +**Variants:** fp32, bf16 (baseline), fp16, torchao int8 weight-only, +torchao int8 dynamic-act + int8 weight, torchao int4 weight-only, +bitsandbytes LLM.int8, bitsandbytes nf4 (with and without +double-quantization), bitsandbytes fp4. + +**Results (vs GPT-5.4 proxy gold):** + +| variant | enc MB | ms/samp | thru/s | VRAM MB | cat F1 | spec F1 | spec QWK | +|--------------------|-------:|--------:|-------:|--------:|-------:|--------:|---------:| +| fp32 | 1579 | 16.29 | 61 | 3504 | 0.9337 | 0.8943 | 0.9321 | +| **bf16 baseline** | 790 | 5.52 | 181 | 1741 | 0.9337 | 0.8952 | 0.9324 | +| fp16 | 790 | 5.54 | 181 | 1741 | 0.9337 | 0.8952 | 0.9324 | +| **torchao int8-wo**| ~395 | 6.08 | 165 | 1416 | 0.9345 | 0.8941 | 0.9330 | +| torchao int8-dyn | ~395 | 9.67 | 103 | 1774 | 0.9336 | 0.8918 | 0.9315 | +| torchao int4-wo | — | — | — | — | err | err | err | +| bnb LLM.int8 | ~395 | 7.76 | 129 | 2135 | 0.9361 | 0.8986 | 0.9308 | +| bnb nf4 (DQ) | 275 | 5.86 | 171 | 1287 | 0.3537 | 0.2205 | 0.2423 | +| bnb nf4 (no DQ) | 275 | 5.86 | 171 | 1287 | 0.3537 | 0.2205 | 0.2423 | +| bnb fp4 | 275 | 5.87 | 170 | 1287 | 0.1629 | 0.2085 | 0.2326 | + +(torchao subclass tensors report bf16 element_size, so "395 MB" is the +true storage estimate, not what `param.element_size()` returns.) + +**Six findings:** + +1. **bf16 + flash-attn-2 is already the sweet spot.** 3.0× throughput over + fp32 with bit-identical accuracy and half the VRAM. Nothing in the + precision dimension beats it on this hardware. +2. **fp16 ≡ bf16.** RTX 3090 has matched fp16/bf16 tensor-core throughput + and the model has no overflow issues; pick whichever the loader + prefers. +3. **torchao int8 weight-only is the only quantization that's worth + shipping.** −19% VRAM (1741 → 1416 MB), accuracy delta inside ±0.002 + per-seed noise, +10% latency because RTX 3090 (sm_8.6) lacks the int8 + tensor-core matmul path that torchao would otherwise route through — + so the int8 weight is dequantized to bf16 on the fly. 
**This is the + variant we'd ship as the "low-VRAM" deployment option**, and on + Hopper / Ada the latency would invert and be a strict win. +4. **torchao int8 dynamic-activation regresses on Ampere.** −43% + throughput and *more* peak VRAM than bf16 because the per-batch + activation quantization adds work without unlocking the int8 + matmul. Skip. +5. **bnb LLM.int8 is the slowest int8 path and uses *more* VRAM than + bf16.** Mixed-precision outlier handling adds 23% peak memory and 41% + latency for an F1 bump that's inside noise. It's tuned for LLM-scale + models where outlier features dominate quant error; for an + encoder this size on a single 3090 it's a regression. +6. **All 4-bit variants collapse to near-random.** Both nf4 (DQ and + no-DQ) and fp4 produce essentially category-prior and L1-collapsed + predictions (cat ECE jumps from 0.054 to 0.10–0.21). We verified per + layer that the dequantized weights of one MLP `Wi` differ from the + original by mean 0.005 / max 0.11 — quantization is *correct* — but + the relative output drift on a single Linear is already ~98% (mean), + and that compounds across 28 transformer blocks + GLU FFN paths until + the [CLS]/pooled representation no longer carries the discriminative + signal. **DQ vs no-DQ produce bit-identical predictions** because the + nf4 weight indices are stable under absmax requantization (only the + metadata block differs). The catastrophe is inherent to 4-bit weight + precision on this architecture, not to a config knob. Recovering 4-bit + would require QAT, GPTQ/AWQ-style per-channel calibration, or keeping + the GLU FFN in 8-bit while only 4-bit'ing attention projections — + none reachable inside the remaining capstone budget. + +**Paper hooks:** +- Add a "deployment precision" row to the speed/cost table — bf16 vs + torchao int8-wo gives a clean Pareto pair (latency vs VRAM). 
+ +- One paragraph in the discussion alongside the DAPT and CORAL nulls: + *naive post-training 4-bit weight quantization is not viable for + ModernBERT-large on this task; the GLU FFN amplifies per-layer weight + error across 28 blocks until signal is destroyed*. This is a useful + counterpoint to the 4-bit-by-default LLM serving narrative and a + legitimate negative result tied to architectural choices. +- Caveat the int8 latency rows with the sm_8.6 hardware footnote — the + result would invert on H100/A100/Ada. + +Full standalone report at `results/eval/quant/REPORT.md`; per-variant +metrics at `results/eval/quant/<variant>/metrics.json`; aggregate row data +at `results/eval/quant/summary.json`. + +### 10.9 ONNX Export + Eval (2026-04-07) + +**Question:** can we get a portable ONNX artifact with comparable +latency / accuracy? What does the ORT path look like for fp32, fp16, +and int8? + +**Setup:** new driver at `python/scripts/onnx_export_eval.py` (`bun run +py:onnx`). Exports the iter1-independent checkpoint, runs ORT inference +on the full holdout via CUDAExecutionProvider, and compares against the +proxy gold. + +**Six things broke along the way; documenting because each one is a real +gotcha for the paper's reproducibility section:** + +1. **Dynamo exporter optimizer crashes.** `torch.onnx.export(..., + dynamo=True)` translates the graph but its post-translation `InlinePass` + trips on `onnx_ir`. Workaround: `optimize=False`. +2. **Dynamo-exported graph is unusable on CUDA EP.** ORT inserts 56 + Memcpy nodes between layers because dynamo emits scalar tensors with + CPU-side placement metadata. Result: 42.9 ms/sample (8× torch fp32) + and 15.4 GB peak VRAM (4.4× torch fp32). The legacy TorchScript + exporter (`dynamo=False`) only inserts 1 Memcpy and is the only + working export path. +3. **`op_types_to_quantize=['MatMul']` quantizes nothing on the dynamo + graph.** Dynamo emits encoder linears as `Gemm`, not `MatMul`. Need + `['MatMul', 'Gemm']`. +4. 
**Both ORT shape-inference paths choke on ModernBERT.** Symbolic + inference asserts in `_infer_Range` (the rotary embedding's `limit` + input is not a scalar); the C++ path raises a (1024)/(7) dimension + mismatch on the category head Gemm. The `skip_*` flags on + `quant_pre_process` are *ignored* — it always runs symbolic shape + inference — and `ONNXQuantizer.__init__` calls + `save_and_reload_model_with_shape_infer` unconditionally. Workaround: + monkey-patch both bindings to no-ops, then pass + `extra_options={'DefaultTensorType': onnx.TensorProto.FLOAT}` so the + quantizer can still type the head MatMul output. +5. **fp16 conversion via `onnxconverter_common` breaks on rotary + embeddings.** Two distinct failure modes seen across exports — `Type + parameter (T) of Optype (Mul) bound to different types + (tensor(float) and tensor(float16)) in node + /model/backbone/rotary_emb_1/Mul_2`. The converter leaves the + `inv_freq` buffer in fp32 and the surrounding `Mul`/`Expand` ops + then can't unify their type parameter. Patchable with an + `op_block_list` for the rotary subgraph, but cost/value isn't there + given the int8 result below. +6. **Dynamic int8 via ORT silently falls back to CPU.** The quantizer + replaces Gemm/MatMul with `MatMulInteger` + `DynamicQuantizeLinear`, + neither of which has CUDA kernels in onnxruntime-gpu 1.24. Session + creation succeeds with `CUDAExecutionProvider` but routes the + quantized ops to the CPU EP — observable from the load-time GPU + memory delta collapsing from 2074 MB (fp32) to 266 MB (int8) and + latency exploding to **95.9 ms/sample**. Accuracy also drops to + cat F1 = 0.397 / spec F1 = 0.336, further confirming the kernel + path is wrong (not just slow). 
+ +**Results (legacy exporter, 1,200 holdout, vs GPT-5.4):** + +| variant | size MB | ms/samp | VRAM MB | cat F1 | spec F1 | spec QWK | +|--------------------|--------:|--------:|--------:|-------:|--------:|---------:| +| **onnx-fp32** | 1583 | 12.70 | 8228 | 0.9337 | 0.8952 | 0.9324 | +| onnx-fp16 | 754 | err | err | err | err | err | +| onnx-int8 (dynamic)| 527 | 95.91 | ~CPU | 0.3972 | 0.3364 | 0.4413 | + +For comparison, the torch baselines from Phase 10.8: +- torch fp32: 16.29 ms / 3504 MB / cat 0.9337 / spec 0.8943 +- torch bf16: **5.52 ms / 1741 MB** / cat 0.9337 / spec 0.8952 + +**Three findings:** + +1. **The one clean win — ORT fp32 beats torch fp32 by 22% on latency + (12.70 vs 16.29 ms)** at bit-identical accuracy, thanks to ORT's + LayerNorm + Gelu + MatMul kernel fusion. VRAM is 2.3× torch's + (8228 vs 3504 MB) because the ORT session allocates a separate + ~5 GB workspace — fair trade for batched inference. But torch bf16 + + flash-attn-2 still wins outright on every dimension (5.52 ms, + 1741 MB), so this is a moral victory at best. +2. **fp16 ONNX is currently unreachable** without writing custom rotary + handling for the float16 converter. Doable but several hours of + plumbing for an artifact that bf16 already dominates. +3. **ORT dynamic int8 is a deployment trap on this hardware.** It looks + like it works (export succeeds, file shrinks 1583 → 527 MB, session + constructs cleanly with CUDAExecutionProvider in the providers list), + but at runtime the integer matmul ops route to the CPU EP and the + model produces ~uniform-prior predictions because the per-channel + weight quantization interacts badly with the activation + quantization path. Both observations would silently bite a + production deployment that didn't run a holdout sanity check. 
+ +**Net recommendation: don't ship ONNX for this model on this hardware.** +torchao int8-wo from §10.8 still owns the "smaller deployment" Pareto +slot (5.52 → 6.08 ms, 1741 → 1416 MB, F1 within ±0.001) more cleanly +than any ONNX variant we could produce here. ONNX would be worth +revisiting only for CPU-only deployment, cross-runtime portability +(TensorRT/OpenVINO/mobile), or a properly calibrated static int8 path +with a ModernBERT-aware op block list — none reachable inside the +remaining capstone budget. + +**Paper hooks:** +- One paragraph in the deployment / reproducibility discussion: + *ONNX export of ModernBERT-large via the dynamo exporter is currently + broken (excessive Memcpy insertion); the legacy TorchScript exporter + produces a clean graph that's 22% faster than torch fp32 via ORT + kernel fusion, but bf16 + flash-attn-2 dominates at half the latency. + fp16 conversion via onnxconverter_common fails on rotary embeddings, + and ORT dynamic int8 silently falls back to CPU on + onnxruntime-gpu 1.24, dropping ~0.5 macro F1.* +- Add a "deployment lessons learned" sub-bullet to the limitations + section so a follow-on engineering team doesn't waste a day chasing + the same dead ends. + +Full standalone report at `results/eval/onnx/REPORT.md`; aggregate +results at `results/eval/onnx/summary.json`; exported models at +`results/eval/onnx/models/`. + ### Phase 10 Summary | Experiment | Cost | Outcome | Paper value | @@ -1107,6 +1313,8 @@ epoch 3 + no temperature scaling would be a reasonable alternative choice. 
| Temperature scaling | ~10 min GPU | ECE −33% cat, −40% spec, F1 unchanged | Calibration story, deployment quality | | Pooling ablation (attention vs CLS) | ~3h GPU | +0.005 F1 consistent, small effect | Validates design, credits independent thresholds | | DAPT re-test with new architecture | ~3h GPU | Val best NLL 0.333→0.318 (−4.5%), F1 +0.007 cat; holdout null; gen gap unchanged | More nuanced null — better init, not better generalization | +| Quantization sweep (10 variants) | ~5 min GPU | bf16 already optimal; torchao int8-wo = −19% VRAM no F1 cost; **all 4-bit collapses** (ModernBERT-large too quant-sensitive) | Deployment Pareto + 4-bit null result | +| ONNX export + ORT eval | ~10 min GPU | Legacy exporter only working path; ORT fp32 −22% latency vs torch (kernel fusion), but bf16 still wins; fp16 broken on rotary; int8 silently CPU-fallback + 0.5 F1 collapse | Deployment lessons learned, reproducibility caveats | The 3-seed ensemble is now the recommended headline checkpoint. The calibrated ECE numbers should replace the pre-scaling ECE in the paper. 
The diff --git a/package.json b/package.json index 08bd531..c0ee483 100644 --- a/package.json +++ b/package.json @@ -20,6 +20,8 @@ "ts:sec": "bun run --filter sec-cybert sec", "ts:typecheck": "bun run --filter sec-cybert typecheck", "py:train": "cd python && PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True uv run main.py", + "py:quant": "cd python && PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True uv run scripts/quantize_sweep.py", + "py:onnx": "cd python && PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True uv run scripts/onnx_export_eval.py", "typecheck": "bun run --filter '*' typecheck", "data:push": "./scripts/data-push.sh", "data:pull": "./scripts/data-pull.sh", diff --git a/python/pyproject.toml b/python/pyproject.toml index 235c9c3..641206d 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -18,6 +18,11 @@ dependencies = [ "krippendorff>=0.8.2", "matplotlib>=3.10.8", "seaborn>=0.13.2", + "onnx>=1.21.0", + "onnxruntime-gpu>=1.24.4", + "onnxruntime>=1.24.4", + "onnxscript>=0.6.2", + "onnxconverter-common>=1.16.0", ] [project.scripts] diff --git a/python/scripts/onnx_export_eval.py b/python/scripts/onnx_export_eval.py new file mode 100644 index 0000000..751beb8 --- /dev/null +++ b/python/scripts/onnx_export_eval.py @@ -0,0 +1,369 @@ +"""ONNX export + eval for the iter1-independent ModernBERT-large checkpoint. 
+ +Variants: + onnx-fp32 — straight torch.onnx.export from the fp32 model + onnx-fp16 — fp32 export converted to fp16 via onnxconverter_common + (proxy for bf16; ORT does not support bf16 inference natively) + onnx-int8-dyn — dynamic int8 quantization of the fp32 graph via + onnxruntime.quantization.quantize_dynamic (weights in int8, + activations quantized at runtime) + +For each variant: + - latency (ms/sample, batch=64, 5 warmup batches) + - peak GPU memory delta around the session (free-mem snapshot) + - on-disk size of model.onnx + model.onnx.data + - cat / spec macro F1, QWK, ECE on the 1,200-paragraph holdout + against GPT-5.4 + Opus-4.6 proxy gold + +Usage: + bun run py:onnx +""" + +from __future__ import annotations + +import gc +import json +import os +import sys +import time +from pathlib import Path + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT)) + +from src.finetune.data import CAT2ID, CATEGORIES, NUM_CATEGORIES, NUM_SPECIFICITY # noqa: E402 +from src.finetune.eval import SPEC_LABELS, compute_all_metrics, load_holdout_data # noqa: E402 +from src.finetune.model import ordinal_predict # noqa: E402 +from scripts.quantize_sweep import ( # noqa: E402 + BENCHMARKS, BATCH_SIZE, HOLDOUT, MAX_SEQ, PARAGRAPHS, WARMUP_BATCHES, + _build_model, evaluate_predictions, +) + +OUTPUT_DIR = ROOT.parent / "results/eval/onnx" +ONNX_DIR = OUTPUT_DIR / "models" +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) +ONNX_DIR.mkdir(parents=True, exist_ok=True) + + +# ────────────────────────────────────────────────────────────────────── +# Export +# ────────────────────────────────────────────────────────────────────── + +class _Wrap(nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, input_ids, attention_mask): + out = self.model(input_ids=input_ids, attention_mask=attention_mask) + return 
out["category_logits"], out["specificity_logits"] + + +def export_fp32(out_path: Path, sample_batch: int = 4, sample_seq: int = 64) -> None: + print(f" building fp32 torch model...") + model, tokenizer = _build_model(torch.float32, attn_impl="sdpa") + model = model.cuda().eval() + wrap = _Wrap(model).cuda().eval() + + dummy_text = ["the company maintains a cybersecurity program overseen by the board"] * sample_batch + enc = tokenizer( + dummy_text, padding="max_length", max_length=sample_seq, + truncation=True, return_tensors="pt", + ).to("cuda") + + print(f" exporting → {out_path}") + # Legacy TorchScript exporter (dynamo=False). The dynamo path produces a + # graph with 56+ Memcpy nodes when run on CUDAExecutionProvider, blowing + # latency 8× and VRAM 4× over native torch — unusable. The legacy + # exporter emits clean Gemm/MatMul/LayerNorm nodes ORT can fuse. + torch.onnx.export( + wrap, + (enc["input_ids"], enc["attention_mask"]), + str(out_path), + input_names=["input_ids", "attention_mask"], + output_names=["cat_logits", "spec_logits"], + dynamic_axes={ + "input_ids": {0: "batch", 1: "seq"}, + "attention_mask": {0: "batch", 1: "seq"}, + "cat_logits": {0: "batch"}, + "spec_logits": {0: "batch"}, + }, + opset_version=17, + dynamo=False, + do_constant_folding=True, + ) + + del wrap, model + gc.collect() + torch.cuda.empty_cache() + + +def convert_fp16(fp32_path: Path, fp16_path: Path) -> None: + """Convert an fp32 ONNX model to fp16 via onnxconverter_common.""" + import onnx + from onnxconverter_common import float16 + + print(f" loading {fp32_path}") + model = onnx.load(str(fp32_path), load_external_data=True) + print(f" converting to fp16...") + model_fp16 = float16.convert_float_to_float16( + model, keep_io_types=False, disable_shape_infer=True, + ) + print(f" saving → {fp16_path}") + onnx.save_model( + model_fp16, str(fp16_path), + save_as_external_data=True, + all_tensors_to_one_file=True, + location=fp16_path.name + ".data", + size_threshold=1024, + ) + + 
+def quantize_int8_dynamic(fp32_path: Path, int8_path: Path) -> None: + """Dynamic int8 quantization (weights → int8, activations on the fly). + + Two shape-inference paths in the ORT quantizer choke on the dynamo + export of ModernBERT-large: + + 1. `SymbolicShapeInference._infer_Range` asserts on the dynamic limit + input emitted by RoPE (`assert len(x) == 1` in `as_scalar`). + 2. `onnx.shape_inference.infer_shapes_path` (C++) raises a (1024)/(7) + dim mismatch on the category head Gemm — the dynamo decomposition + leaves a dimension hint the C++ inferencer disagrees with. + + The skip flags on `quant_pre_process` are ignored (it always runs + `SymbolicShapeInference.infer_shapes`), and `ONNXQuantizer.__init__` + calls `save_and_reload_model_with_shape_infer` unconditionally. We + monkey-patch both to no-ops, then run `quantize_dynamic` restricted to + MatMul ops (the only nodes we want quantized anyway). + """ + import onnx + from onnxruntime.quantization import QuantType, quantize_dynamic + from onnxruntime.quantization import quant_utils + from onnxruntime.tools import symbolic_shape_infer as sym + + # No-op the broken shape passes. + original_save_reload = quant_utils.save_and_reload_model_with_shape_infer + + def _passthrough(model): + return model + + quant_utils.save_and_reload_model_with_shape_infer = _passthrough + # Some imports cache the symbol — patch the onnx_quantizer module too. 
+ import onnxruntime.quantization.onnx_quantizer as oq + oq.save_and_reload_model_with_shape_infer = _passthrough + + try: + print(f" quantizing {fp32_path} → {int8_path}") + quantize_dynamic( + model_input=str(fp32_path), + model_output=str(int8_path), + weight_type=QuantType.QInt8, + per_channel=True, + reduce_range=False, + op_types_to_quantize=["MatMul", "Gemm"], + use_external_data_format=True, + extra_options={"DefaultTensorType": onnx.TensorProto.FLOAT}, + ) + finally: + quant_utils.save_and_reload_model_with_shape_infer = original_save_reload + oq.save_and_reload_model_with_shape_infer = original_save_reload + + +# ────────────────────────────────────────────────────────────────────── +# Inference + metrics +# ────────────────────────────────────────────────────────────────────── + +def _files_size(model_path: Path) -> int: + """Sum of model.onnx + any external .data files in the same dir.""" + total = model_path.stat().st_size + for sib in model_path.parent.iterdir(): + if sib.name.startswith(model_path.name) and sib != model_path: + total += sib.stat().st_size + return total + + +def run_onnx(model_path: Path, texts: list[str], use_cuda: bool = True) -> dict: + import onnxruntime as ort + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained( + "../checkpoints/finetune/iter1-independent/final" + ) + + so = ort.SessionOptions() + so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + providers = ( + ["CUDAExecutionProvider", "CPUExecutionProvider"] if use_cuda + else ["CPUExecutionProvider"] + ) + + free_before, total_vram = torch.cuda.mem_get_info() + sess = ort.InferenceSession(str(model_path), so, providers=providers) + free_after_load, _ = torch.cuda.mem_get_info() + load_vram_mb = (free_before - free_after_load) / (1024 ** 2) + + # Warmup + warm_enc = tokenizer( + texts[:BATCH_SIZE], truncation=True, max_length=MAX_SEQ, + padding="longest", return_tensors="np", + ) + warm_inputs = { + "input_ids": 
warm_enc["input_ids"].astype(np.int64), + "attention_mask": warm_enc["attention_mask"].astype(np.int64), + } + for _ in range(WARMUP_BATCHES): + sess.run(None, warm_inputs) + + free_after_warm, _ = torch.cuda.mem_get_info() + peak_vram_mb = (free_before - free_after_warm) / (1024 ** 2) + + cat_logits_list = [] + spec_logits_list = [] + total_time = 0.0 + for i in range(0, len(texts), BATCH_SIZE): + batch = texts[i : i + BATCH_SIZE] + enc = tokenizer( + batch, truncation=True, max_length=MAX_SEQ, + padding="longest", return_tensors="np", + ) + inputs = { + "input_ids": enc["input_ids"].astype(np.int64), + "attention_mask": enc["attention_mask"].astype(np.int64), + } + t0 = time.perf_counter() + out = sess.run(None, inputs) + total_time += time.perf_counter() - t0 + cat_logits_list.append(torch.from_numpy(out[0].astype(np.float32))) + spec_logits_list.append(torch.from_numpy(out[1].astype(np.float32))) + + free_end, _ = torch.cuda.mem_get_info() + peak_vram_mb = max(peak_vram_mb, (free_before - free_end) / (1024 ** 2)) + + del sess + gc.collect() + torch.cuda.empty_cache() + + return { + "cat_logits": torch.cat(cat_logits_list), + "spec_logits": torch.cat(spec_logits_list), + "ms_per_sample": (total_time / len(texts)) * 1000, + "throughput": len(texts) / total_time, + "peak_vram_mb": peak_vram_mb, + "load_vram_mb": load_vram_mb, + "providers": providers, + } + + +# ────────────────────────────────────────────────────────────────────── +# Driver +# ────────────────────────────────────────────────────────────────────── + +def main(): + print("loading holdout...") + records = load_holdout_data( + str(PARAGRAPHS), str(HOLDOUT), {k: str(v) for k, v in BENCHMARKS.items()}, + ) + texts = [r["text"] for r in records] + print(f" {len(records)} paragraphs") + + fp32_path = ONNX_DIR / "model_fp32.onnx" + fp16_path = ONNX_DIR / "model_fp16.onnx" + int8_path = ONNX_DIR / "model_int8_dyn.onnx" + + # ── Export fp32 (source for both fp16 and int8 quant) ── + if not 
fp32_path.exists(): + print("\n══ exporting fp32 ONNX") + export_fp32(fp32_path) + else: + print(f"\n══ reusing existing {fp32_path}") + + # ── fp16 conversion ── + if not fp16_path.exists(): + print("\n══ converting → fp16 ONNX") + convert_fp16(fp32_path, fp16_path) + else: + print(f"\n══ reusing existing {fp16_path}") + + # ── int8 dynamic quantization ── + if not int8_path.exists(): + print("\n══ quantizing → int8 dynamic ONNX") + quantize_int8_dynamic(fp32_path, int8_path) + else: + print(f"\n══ reusing existing {int8_path}") + + summary = [] + variants = [ + ("onnx-fp32", fp32_path), + ("onnx-fp16", fp16_path), + ("onnx-int8-dyn", int8_path), + ] + for name, path in variants: + print(f"\n══ {name} — {path.name}") + size_mb = _files_size(path) / 1e6 + print(f" on-disk size: {size_mb:.1f} MB") + try: + inf = run_onnx(path, texts, use_cuda=True) + print( + f" latency {inf['ms_per_sample']:.2f} ms/sample, " + f"throughput {inf['throughput']:.0f}/s, " + f"peak VRAM {inf['peak_vram_mb']:.0f} MB " + f"(load {inf['load_vram_mb']:.0f} MB)" + ) + row = { + "variant": name, + "model_mb": size_mb, + "ms_per_sample": inf["ms_per_sample"], + "throughput_per_s": inf["throughput"], + "peak_vram_mb": inf["peak_vram_mb"], + "load_vram_mb": inf["load_vram_mb"], + } + for ref in BENCHMARKS: + m = evaluate_predictions(inf["cat_logits"], inf["spec_logits"], records, ref) + print( + f" vs {ref}: cat F1={m['cat_macro_f1']:.4f}, " + f"spec F1={m['spec_macro_f1']:.4f}, QWK={m['spec_qwk']:.4f}, " + f"cat ECE={m['cat_ece']:.4f}, spec ECE={m['spec_ece']:.4f}" + ) + row[f"{ref}_cat_f1"] = m["cat_macro_f1"] + row[f"{ref}_spec_f1"] = m["spec_macro_f1"] + row[f"{ref}_cat_mcc"] = m["cat_mcc"] + row[f"{ref}_spec_qwk"] = m["spec_qwk"] + row[f"{ref}_spec_mae"] = m["spec_mae"] + row[f"{ref}_cat_ece"] = m["cat_ece"] + row[f"{ref}_spec_ece"] = m["spec_ece"] + summary.append(row) + except Exception as e: + import traceback + traceback.print_exc() + summary.append({"variant": name, "error": 
f"{type(e).__name__}: {e}"}) + + summary_path = OUTPUT_DIR / "summary.json" + with open(summary_path, "w") as f: + json.dump(summary, f, indent=2, default=str) + print(f"\nsummary → {summary_path}") + + print("\n" + "=" * 110) + print(f"{'variant':<18} {'MB':>9} {'ms/samp':>9} {'throughput':>11} " + f"{'VRAM MB':>9} {'cat F1':>9} {'spec F1':>9} {'spec QWK':>9}") + print("-" * 110) + for r in summary: + if "error" in r: + print(f"{r['variant']:<18} ERROR: {r['error']}") + continue + print( + f"{r['variant']:<18} {r['model_mb']:>9.1f} {r['ms_per_sample']:>9.2f} " + f"{r['throughput_per_s']:>11.0f} {r['peak_vram_mb']:>9.0f} " + f"{r['GPT-5.4_cat_f1']:>9.4f} {r['GPT-5.4_spec_f1']:>9.4f} " + f"{r['GPT-5.4_spec_qwk']:>9.4f}" + ) + print("=" * 110) + + +if __name__ == "__main__": + main() diff --git a/python/scripts/quantize_sweep.py b/python/scripts/quantize_sweep.py new file mode 100644 index 0000000..62fda6b --- /dev/null +++ b/python/scripts/quantize_sweep.py @@ -0,0 +1,491 @@ +"""Quantization sweep for the iter1-independent ModernBERT-large checkpoint. + +Loads the trained DualHeadModernBERT, applies a series of quantization +schemes to the *encoder* (heads kept in their native dtype), and re-runs +holdout evaluation against the GPT-5.4 / Opus-4.6 proxy gold. + +For each variant we record: + - cat / spec macro F1, per-class F1, QWK, MAE, ECE + - peak VRAM (encoder forward) + - latency (ms/sample, batch=64) and throughput + - encoder parameter footprint in MB + - delta vs bf16 baseline + +Variants: + fp32, bf16 (baseline), fp16, + torchao int8 weight-only, + torchao int8 dynamic-act + int8 weight, + torchao int4 weight-only (group=128), + bitsandbytes LLM.int8 (8-bit), + bitsandbytes nf4 (4-bit, double-quant, bf16 compute). 
+ +Heads (category linear, attention pooler, independent threshold MLPs) +stay in bf16 — they sit on a 1024-dim representation and account for +< 0.3% of params, so quantizing them buys nothing and risks the threshold +margins which already drive most of the spec error budget. + +Usage: + bun run py:quant # via package.json wrapper + # or directly: + cd python && uv run scripts/quantize_sweep.py +""" + +from __future__ import annotations + +import gc +import json +import sys +import time +import traceback +from dataclasses import dataclass, field +from pathlib import Path + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from safetensors.torch import load_file +from transformers import AutoModel, AutoTokenizer + +# Make `src` importable when run as a script +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT)) + +from src.finetune.data import CAT2ID, CATEGORIES, NUM_CATEGORIES, NUM_SPECIFICITY # noqa: E402 +from src.finetune.eval import ( # noqa: E402 + SPEC_LABELS, + compute_all_metrics, + load_holdout_data, +) +from src.finetune.model import DualHeadModernBERT, ordinal_predict # noqa: E402 + +REPO = ROOT.parent +CHECKPOINT = REPO / "checkpoints/finetune/iter1-independent/final" +PARAGRAPHS = REPO / "data/paragraphs/paragraphs-clean.patched.jsonl" +HOLDOUT = REPO / "data/gold/v2-holdout-ids.json" +BENCHMARKS = { + "GPT-5.4": REPO / "data/annotations/v2-bench/gpt-5.4.jsonl", + "Opus-4.6": REPO / "data/annotations/v2-bench/opus-4.6.jsonl", +} +OUTPUT_DIR = REPO / "results/eval/quant" +BATCH_SIZE = 64 +MAX_SEQ = 512 +WARMUP_BATCHES = 5 + + +# ────────────────────────────────────────────────────────────────────── +# Model loading +# ────────────────────────────────────────────────────────────────────── + +def _build_model(dtype: torch.dtype, attn_impl: str = "sdpa") -> tuple[DualHeadModernBERT, AutoTokenizer]: + """Construct DualHeadModernBERT and load trained weights at the requested dtype.""" + 
tokenizer = AutoTokenizer.from_pretrained(str(CHECKPOINT)) + backbone = AutoModel.from_pretrained( + "answerdotai/ModernBERT-large", + trust_remote_code=True, + attn_implementation=attn_impl, + dtype=dtype, + ) + model = DualHeadModernBERT( + backbone=backbone, + hidden_size=backbone.config.hidden_size, + num_categories=NUM_CATEGORIES, + num_specificity=NUM_SPECIFICITY, + specificity_head_type="independent", + spec_mlp_dim=256, + pooling="attention", + ) + state = load_file(str(CHECKPOINT / "model.safetensors")) + model.load_state_dict(state, strict=False) + model = model.to(dtype) + model.eval() + return model, tokenizer + + +def _try_flash_attn() -> str: + try: + import flash_attn # noqa: F401 + return "flash_attention_2" + except ImportError: + return "sdpa" + + +# ────────────────────────────────────────────────────────────────────── +# Quantization variants +# ────────────────────────────────────────────────────────────────────── + +def variant_native(dtype: torch.dtype, attn: str | None = None): + def _build(): + impl = attn or _try_flash_attn() + # bf16/fp16 supported by flash-attn; fp32 must use sdpa + if dtype == torch.float32: + impl = "sdpa" + model, tok = _build_model(dtype, attn_impl=impl) + return model.cuda(), tok + return _build + + +def variant_torchao(config_factory): + def _build(): + from torchao.quantization import quantize_ + # torchao expects bf16 master weights + model, tok = _build_model(torch.bfloat16, attn_impl=_try_flash_attn()) + model = model.cuda() + # Quantize encoder linears only (skip heads + attention pooler) + quantize_(model.backbone, config_factory()) + return model, tok + return _build + + +def _swap_bnb_linear( + module: nn.Module, + mode: str, + compute_dtype=torch.bfloat16, + compress_statistics: bool = True, +) -> int: + """Recursively replace nn.Linear with bnb 8-bit / 4-bit equivalents. + + Returns number of layers swapped. Copies weights from the original + module so the trained checkpoint is preserved. 
+ """ + import bitsandbytes as bnb + + swapped = 0 + for name, child in list(module.named_children()): + if isinstance(child, nn.Linear): + in_f, out_f = child.in_features, child.out_features + has_bias = child.bias is not None + if mode == "int8": + new = bnb.nn.Linear8bitLt( + in_f, out_f, bias=has_bias, + has_fp16_weights=False, threshold=6.0, + ) + new.weight = bnb.nn.Int8Params( + child.weight.data.clone(), + requires_grad=False, + has_fp16_weights=False, + ) + if has_bias: + new.bias = nn.Parameter(child.bias.data.clone()) + elif mode in ("nf4", "fp4"): + new = bnb.nn.Linear4bit( + in_f, out_f, bias=has_bias, + compute_dtype=compute_dtype, + quant_type=mode, + compress_statistics=compress_statistics, + quant_storage=torch.uint8, + device="cuda", + ) + w = child.weight.data.detach().to(torch.float32).clone() + new.weight = bnb.nn.Params4bit( + w, requires_grad=False, quant_type=mode, + compress_statistics=compress_statistics, module=new, + ).cuda() + if has_bias: + new.bias = nn.Parameter( + child.bias.data.detach().to(compute_dtype).clone().cuda() + ) + else: + raise ValueError(mode) + new = new.cuda() + setattr(module, name, new) + swapped += 1 + else: + swapped += _swap_bnb_linear(child, mode, compute_dtype) + return swapped + + +def variant_bnb(mode: str, compress_statistics: bool = True): + def _build(): + model, tok = _build_model(torch.bfloat16, attn_impl="sdpa") + model = model.cuda() + n = _swap_bnb_linear( + model.backbone, mode, compress_statistics=compress_statistics, + ) + print(f" bnb {mode} (cs={compress_statistics}): swapped {n} linears") + return model, tok + return _build + + +# ────────────────────────────────────────────────────────────────────── +# Inference + measurement +# ────────────────────────────────────────────────────────────────────── + +def _encoder_param_bytes(model: DualHeadModernBERT) -> int: + """Sum bytes of every parameter / buffer inside the encoder backbone. 
+ + Handles bnb Int8Params (int8 storage) and Params4bit (uint8 packed) + correctly because element_size() reflects the storage dtype. + """ + total = 0 + seen = set() + for p in list(model.backbone.parameters()) + list(model.backbone.buffers()): + if id(p) in seen: + continue + seen.add(id(p)) + total += p.numel() * p.element_size() + return total + + +@torch.no_grad() +def run_inference(model, tokenizer, texts: list[str]) -> dict: + device = next(model.parameters()).device + cat_logits_list = [] + spec_logits_list = [] + + # Warmup + warm_batch = tokenizer( + texts[: BATCH_SIZE], truncation=True, max_length=MAX_SEQ, + padding="longest", return_tensors="pt", + ).to(device) + for _ in range(WARMUP_BATCHES): + _ = model(input_ids=warm_batch["input_ids"], attention_mask=warm_batch["attention_mask"]) + torch.cuda.synchronize() + torch.cuda.reset_peak_memory_stats() + + total_time = 0.0 + for i in range(0, len(texts), BATCH_SIZE): + batch = texts[i : i + BATCH_SIZE] + enc = tokenizer( + batch, truncation=True, max_length=MAX_SEQ, + padding="longest", return_tensors="pt", + ).to(device) + torch.cuda.synchronize() + t0 = time.perf_counter() + out = model(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"]) + torch.cuda.synchronize() + total_time += time.perf_counter() - t0 + cat_logits_list.append(out["category_logits"].float().cpu()) + spec_logits_list.append(out["specificity_logits"].float().cpu()) + + peak_vram = torch.cuda.max_memory_allocated() + cat_logits = torch.cat(cat_logits_list) + spec_logits = torch.cat(spec_logits_list) + return { + "cat_logits": cat_logits, + "spec_logits": spec_logits, + "total_time_s": total_time, + "ms_per_sample": (total_time / len(texts)) * 1000, + "throughput": len(texts) / total_time, + "peak_vram_mb": peak_vram / (1024 ** 2), + "num_samples": len(texts), + } + + +def evaluate_predictions( + cat_logits: torch.Tensor, + spec_logits: torch.Tensor, + records: list[dict], + ref_name: str, +) -> dict: + cat_probs_all = 
F.softmax(cat_logits, dim=1).numpy() + cat_preds_all = cat_logits.argmax(dim=1).numpy() + spec_preds_all = ordinal_predict(spec_logits).numpy() + # ordinal → class probs + sp = torch.sigmoid(spec_logits) + K = sp.shape[1] + 1 + spec_probs_all = torch.zeros(sp.shape[0], K) + spec_probs_all[:, 0] = 1 - sp[:, 0] + for k in range(1, K - 1): + spec_probs_all[:, k] = sp[:, k - 1] - sp[:, k] + spec_probs_all[:, -1] = sp[:, -1] + spec_probs_all = spec_probs_all.clamp(min=0) + spec_probs_all = spec_probs_all / spec_probs_all.sum(dim=1, keepdim=True) + spec_probs_all = spec_probs_all.numpy() + + cat_labels, spec_labels = [], [] + cat_p, spec_p, cat_pr, spec_pr = [], [], [], [] + for i, rec in enumerate(records): + b = rec["benchmark_labels"].get(ref_name) + if b is None: + continue + cat_labels.append(CAT2ID[b["category"]]) + spec_labels.append(b["specificity"] - 1) + cat_p.append(cat_preds_all[i]) + spec_p.append(spec_preds_all[i]) + cat_pr.append(cat_probs_all[i]) + spec_pr.append(spec_probs_all[i]) + + cat_m = compute_all_metrics( + np.array(cat_p), np.array(cat_labels), np.array(cat_pr), + CATEGORIES, "cat", is_ordinal=False, + ) + spec_m = compute_all_metrics( + np.array(spec_p), np.array(spec_labels), np.array(spec_pr), + SPEC_LABELS, "spec", is_ordinal=True, + ) + return {**cat_m, **spec_m} + + +# ────────────────────────────────────────────────────────────────────── +# Variant registry +# ────────────────────────────────────────────────────────────────────── + +@dataclass +class Variant: + name: str + description: str + builder: callable + skip_reason: str | None = None + + +def build_variants() -> list[Variant]: + from torchao.quantization import ( + Int4WeightOnlyConfig, + Int8DynamicActivationInt8WeightConfig, + Int8WeightOnlyConfig, + ) + + return [ + Variant("fp32", "Float32 encoder + heads", variant_native(torch.float32, attn="sdpa")), + Variant("bf16", "BFloat16 baseline (matches eval pipeline)", variant_native(torch.bfloat16)), + Variant("fp16", "Float16 
encoder + heads", variant_native(torch.float16)), + Variant( + "torchao-int8-wo", + "torchao Int8 weight-only on encoder linears", + variant_torchao(lambda: Int8WeightOnlyConfig()), + ), + Variant( + "torchao-int8-dyn", + "torchao Int8 dynamic activation + Int8 weight on encoder", + variant_torchao(lambda: Int8DynamicActivationInt8WeightConfig()), + ), + Variant( + "torchao-int4-wo", + "torchao Int4 weight-only (group=128) on encoder", + variant_torchao(lambda: Int4WeightOnlyConfig(group_size=128)), + ), + Variant("bnb-int8", "bitsandbytes LLM.int8 on encoder linears", variant_bnb("int8")), + Variant("bnb-nf4", "bitsandbytes NF4 4-bit (double-quant, bf16 compute)", variant_bnb("nf4", compress_statistics=True)), + Variant("bnb-nf4-nodq", "bitsandbytes NF4 4-bit (no double-quant)", variant_bnb("nf4", compress_statistics=False)), + Variant("bnb-fp4", "bitsandbytes FP4 4-bit (no double-quant)", variant_bnb("fp4", compress_statistics=False)), + ] + + +# ────────────────────────────────────────────────────────────────────── +# Driver +# ────────────────────────────────────────────────────────────────────── + +def free(): + gc.collect() + torch.cuda.empty_cache() + torch.cuda.synchronize() + + +def main(): + OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + print(f"Loading holdout from {HOLDOUT}") + records = load_holdout_data( + str(PARAGRAPHS), str(HOLDOUT), {k: str(v) for k, v in BENCHMARKS.items()}, + ) + texts = [r["text"] for r in records] + print(f" {len(records)} holdout paragraphs loaded") + + variants = build_variants() + summary = [] + + for v in variants: + print(f"\n══ {v.name} — {v.description}") + free() + try: + t0 = time.perf_counter() + model, tokenizer = v.builder() + build_s = time.perf_counter() - t0 + enc_bytes = _encoder_param_bytes(model) + print(f" encoder footprint: {enc_bytes / 1e6:.1f} MB (build {build_s:.1f}s)") + inf = run_inference(model, tokenizer, texts) + print( + f" latency {inf['ms_per_sample']:.2f} ms/sample, " + f"throughput 
{inf['throughput']:.0f}/s, " + f"peak VRAM {inf['peak_vram_mb']:.0f} MB" + ) + + metrics_per_ref = {} + for ref in BENCHMARKS: + m = evaluate_predictions(inf["cat_logits"], inf["spec_logits"], records, ref) + metrics_per_ref[ref] = m + print( + f" vs {ref}: cat F1={m['cat_macro_f1']:.4f}, " + f"spec F1={m['spec_macro_f1']:.4f}, QWK={m['spec_qwk']:.4f}, " + f"cat ECE={m['cat_ece']:.4f}, spec ECE={m['spec_ece']:.4f}" + ) + + row = { + "variant": v.name, + "description": v.description, + "encoder_mb": enc_bytes / 1e6, + "ms_per_sample": inf["ms_per_sample"], + "throughput_per_s": inf["throughput"], + "peak_vram_mb": inf["peak_vram_mb"], + "build_s": build_s, + } + for ref, m in metrics_per_ref.items(): + row[f"{ref}_cat_f1"] = m["cat_macro_f1"] + row[f"{ref}_spec_f1"] = m["spec_macro_f1"] + row[f"{ref}_cat_mcc"] = m["cat_mcc"] + row[f"{ref}_spec_qwk"] = m["spec_qwk"] + row[f"{ref}_spec_mae"] = m["spec_mae"] + row[f"{ref}_cat_ece"] = m["cat_ece"] + row[f"{ref}_spec_ece"] = m["spec_ece"] + # per-spec-level F1 + for s in SPEC_LABELS: + short = s.replace(" ", "").replace(":", "")[:8] + row[f"{ref}_spec_f1_{short}"] = m.get(f"spec_f1_{short}", 0) + summary.append(row) + + # Per-variant detailed metrics dump + vdir = OUTPUT_DIR / v.name + vdir.mkdir(parents=True, exist_ok=True) + with open(vdir / "metrics.json", "w") as f: + ser = {} + for ref, m in metrics_per_ref.items(): + ser[ref] = { + k: (v_ if not isinstance(v_, np.ndarray) else v_.tolist()) + for k, v_ in m.items() + if isinstance(v_, (int, float, str, list, bool)) + } + ser["_runtime"] = { + "encoder_mb": enc_bytes / 1e6, + "ms_per_sample": inf["ms_per_sample"], + "throughput_per_s": inf["throughput"], + "peak_vram_mb": inf["peak_vram_mb"], + "build_s": build_s, + } + json.dump(ser, f, indent=2, default=str) + + del model, tokenizer, inf + except Exception as e: + print(f" FAILED: {type(e).__name__}: {e}") + traceback.print_exc() + summary.append({ + "variant": v.name, + "description": v.description, + "error": 
f"{type(e).__name__}: {e}", + }) + free() + + # Write summary + summary_path = OUTPUT_DIR / "summary.json" + with open(summary_path, "w") as f: + json.dump(summary, f, indent=2, default=str) + print(f"\nSummary written to {summary_path}") + + # Print compact table + print("\n" + "=" * 110) + print(f"{'variant':<18} {'enc MB':>9} {'ms/samp':>9} {'throughput':>11} " + f"{'VRAM MB':>9} {'cat F1':>9} {'spec F1':>9} {'spec QWK':>9}") + print("-" * 110) + for r in summary: + if "error" in r: + print(f"{r['variant']:<18} ERROR: {r['error']}") + continue + print( + f"{r['variant']:<18} {r['encoder_mb']:>9.1f} {r['ms_per_sample']:>9.2f} " + f"{r['throughput_per_s']:>11.0f} {r['peak_vram_mb']:>9.0f} " + f"{r['GPT-5.4_cat_f1']:>9.4f} {r['GPT-5.4_spec_f1']:>9.4f} {r['GPT-5.4_spec_qwk']:>9.4f}" + ) + print("=" * 110) + + +if __name__ == "__main__": + main() diff --git a/results/eval/onnx/REPORT.md b/results/eval/onnx/REPORT.md new file mode 100644 index 0000000..68cd956 --- /dev/null +++ b/results/eval/onnx/REPORT.md @@ -0,0 +1,117 @@ +# ONNX Export + Eval — iter1-independent ModernBERT-large + +**Date:** 2026-04-07 +**Checkpoint:** `checkpoints/finetune/iter1-independent/final/` +**Hardware:** RTX 3090 (sm_8.6, 24 GB), onnxruntime-gpu 1.24.4, onnx 1.21 +**Driver:** `python/scripts/onnx_export_eval.py` (`bun run py:onnx`) +**Eval set:** 1,200-paragraph v2 holdout, proxy gold = GPT-5.4 + Opus-4.6 + +## TL;DR + +ONNX export of this model is *technically* possible but the path is full of +dead ends. The dynamo exporter produces a graph with 56 Memcpy nodes that +makes ORT 8× slower than native torch and 4× more VRAM-heavy; the legacy +TorchScript exporter produces a clean graph that's actually 22% faster than +torch fp32 (kernel fusion); fp16 conversion breaks on the rotary embedding; +dynamic int8 quantization via ORT silently falls back to CPU and drops +~0.5 macro F1. **Net: torchao int8-wo from the earlier sweep is still the +right int8 deployment path. 
ONNX is not.**
+
+## What we tried
+
+| variant | exporter | size MB | ms/sample | VRAM MB | cat F1 | spec F1 | result |
+|--------------------|----------------------|--------:|----------:|--------:|-------:|--------:|-----------------|
+| onnx-fp32 (dynamo) | torch.onnx (dynamo) | 1583 | 42.92 | 15388 | 0.9337 | 0.8943 | works but unusable |
+| onnx-int8 (dynamo) | dynamo + ORT int8 | 1580 | 42.82 | 15398 | 0.9337 | 0.8943 | no-op (no quant) |
+| **onnx-fp32 (legacy)** | torch.onnx (TorchScript) | 1583 | **12.70** | 8228 | 0.9337 | 0.8952 | **clean graph, faster than torch** |
+| onnx-fp16 (legacy) | onnxconverter_common | 754 | err | err | err | err | rotary type unify |
+| onnx-int8 (legacy) | ORT quantize_dynamic | 527 | 95.91 | ~CPU | 0.3972 | 0.3364 | CPU fallback + accuracy collapse |
+
+(All entries above were re-run from scratch — fp32 timing improved 3× moving
+from dynamo to legacy export.)
+
+## Six things broke along the way (workarounds in the script)
+
+1. **Dynamo exporter optimizer crashes.** `torch.onnx.export(..., dynamo=True)`
+   succeeds at translation but the post-translation `InlinePass` optimizer
+   trips on `onnx_ir`. Workaround: `optimize=False`.
+2. **Dynamo-exported graph is unusable on CUDA EP.** ORT inserts 56 Memcpy
+   nodes between layers because dynamo emits scalar tensors with CPU
+   placement metadata. Result: 42.9 ms/sample (2.6× torch fp32, ~8× the
+   torch bf16 baseline) and 15.4 GB VRAM (4.4× torch fp32). The legacy
+   exporter only inserts 1 Memcpy.
+3. **`op_types_to_quantize=['MatMul']` quantizes nothing on the dynamo
+   graph.** Dynamo emits encoder linears as `Gemm` nodes, not `MatMul`.
+   Fix: `op_types_to_quantize=['MatMul', 'Gemm']`.
+4. **Both ORT shape-inference paths choke on ModernBERT.** Symbolic
+   inference asserts in `_infer_Range` (rotary embedding limit input is
+   not a scalar); the C++ inference raises a (1024)/(7) dim mismatch on
+   the category head Gemm. 
The `skip_*` flags on `quant_pre_process` are + ignored, and `ONNXQuantizer.__init__` calls + `save_and_reload_model_with_shape_infer` unconditionally. Workaround: + monkey-patch `quant_utils.save_and_reload_model_with_shape_infer` + *and* the cached binding in `onnx_quantizer` to a no-op, then pass + `extra_options={'DefaultTensorType': onnx.TensorProto.FLOAT}` so the + quantizer can still type the head MatMul. +5. **fp16 conversion via `onnxconverter_common` breaks on rotary + embeddings.** Two distinct failure modes seen across exports: + `Type Error: Type (tensor(float16)) of output arg (val_58) of node + (node_Expand_56) does not match expected type (tensor(float))` (dynamo + graph) and `Type parameter (T) of Optype (Mul) bound to different types + (tensor(float) and tensor(float16) in node + (/model/backbone/rotary_emb_1/Mul_2)` (legacy graph). The converter + leaves the `inv_freq` buffer in fp32 and the surrounding Mul/Expand + ops then can't unify their type parameter. Could be patched with an + `op_block_list` for the rotary subgraph, but the cost/value isn't + there given the dynamic int8 result below. +6. **Dynamic int8 via ORT silently falls back to CPU.** The quantizer + replaces Gemm/MatMul with `MatMulInteger` + `DynamicQuantizeLinear`, + neither of which has CUDA kernels in onnxruntime-gpu 1.24. Session + creation succeeds with CUDAExecutionProvider but routes the + quantized ops to the CPU EP — observable from the `load_vram_mb` + collapsing from 2074 MB (fp32) to 266 MB (int8) and latency exploding + to 95.9 ms/sample. Per-channel int8 weights also drop accuracy from + 0.934 → 0.397 on category and 0.895 → 0.336 on spec, further + confirming the kernel path is wrong (not just slow). + +## What actually works + +**onnx-fp32 via the legacy TorchScript exporter** is the one clean win: +12.70 ms/sample vs 16.29 for torch fp32 — a **22% latency improvement +from ORT's LayerNorm/Gelu/MatMul fusion** at bit-identical accuracy. 
VRAM +is 8228 MB vs 3504 MB for torch fp32 (the ORT session allocates a separate +~5 GB workspace), so the speedup costs you ~2.3× memory. On a single +3090 batch=64 inference run that's a fair trade. + +But this is fp32 — bf16 torch + flash-attn-2 is *still* the strict winner +at 5.52 ms / 1741 MB (Phase 10.8 result). ORT can't run bf16 natively, and +fp16 conversion is broken. So even the working ONNX path is dominated by +what we already ship. + +## Recommendation + +**Don't use ONNX for this model on this hardware.** The torchao int8-wo +result from the quantization sweep (5.52 → 6.08 ms, 1741 → 1416 MB peak +VRAM, F1 within ±0.001) covers the "smaller deployment" use case more +cleanly than anything ONNX can offer here, and bf16 + flash-attn-2 +remains the production default. + +ONNX *would* be worth revisiting in any of these scenarios: +- **CPU-only deployment** — fp32 ONNX runs fine on CPUExecutionProvider + and ORT's int8 dynamic path is actually designed for this case. Worth + benchmarking if a CPU serving target ever shows up. +- **Cross-runtime portability** — TensorRT, OpenVINO, mobile runtimes. + These would each need their own export validation pass. +- **Static int8 with calibration** — `quantize_static` with a calibration + dataset can avoid the dynamic-quant CPU fallback path. Would need a + ModernBERT-friendly calibration loop and probably an `op_block_list` + to keep the rotary in fp32. Real engineering work, not a one-shot. 
+ +## Reproduce + +```bash +bun run py:onnx +# writes to: +# results/eval/onnx/models/{model_fp32,model_fp16,model_int8_dyn}.onnx[.data] +# results/eval/onnx/summary.json +# results/eval/onnx/REPORT.md (this file) +``` diff --git a/results/eval/onnx/summary.json b/results/eval/onnx/summary.json new file mode 100644 index 0000000..6382ceb --- /dev/null +++ b/results/eval/onnx/summary.json @@ -0,0 +1,50 @@ +[ + { + "variant": "onnx-fp32", + "model_mb": 1583.256294, + "ms_per_sample": 12.703279327494482, + "throughput_per_s": 78.71983085781946, + "peak_vram_mb": 8228.0, + "load_vram_mb": 2074.0, + "GPT-5.4_cat_f1": 0.9336741161693523, + "GPT-5.4_spec_f1": 0.8951731906425856, + "GPT-5.4_cat_mcc": 0.9226990724708704, + "GPT-5.4_spec_qwk": 0.9324447137231142, + "GPT-5.4_spec_mae": 0.1175, + "GPT-5.4_cat_ece": 0.05386760701735813, + "GPT-5.4_spec_ece": 0.07004604930679002, + "Opus-4.6_cat_f1": 0.922684387023173, + "Opus-4.6_spec_f1": 0.8833694419146193, + "Opus-4.6_cat_mcc": 0.909266938399113, + "Opus-4.6_spec_qwk": 0.9227008860372746, + "Opus-4.6_spec_mae": 0.13583333333333333, + "Opus-4.6_cat_ece": 0.06540583113829297, + "Opus-4.6_spec_ece": 0.08156729981303217 + }, + { + "variant": "onnx-fp16", + "error": "Fail: [ONNXRuntimeError] : 1 : FAIL : Load model from /home/joey/Documents/sec-cyBERT/results/eval/onnx/models/model_fp16.onnx failed:Type Error: Type parameter (T) of Optype (Mul) bound to different types (tensor(float) and tensor(float16) in node (/model/backbone/rotary_emb_1/Mul_2)." 
+ }, + { + "variant": "onnx-int8-dyn", + "model_mb": 553.381903, + "ms_per_sample": 95.90791940659983, + "throughput_per_s": 10.42666764316426, + "peak_vram_mb": 7188.0, + "load_vram_mb": 266.0, + "GPT-5.4_cat_f1": 0.3971686880679718, + "GPT-5.4_spec_f1": 0.3364003775746365, + "GPT-5.4_cat_mcc": 0.3459776856134484, + "GPT-5.4_spec_qwk": 0.4412945592628398, + "GPT-5.4_spec_mae": 0.7225, + "GPT-5.4_cat_ece": 0.23434762333830195, + "GPT-5.4_spec_ece": 0.35458642202119034, + "Opus-4.6_cat_f1": 0.4150626036637055, + "Opus-4.6_spec_f1": 0.3204015536108683, + "Opus-4.6_cat_mcc": 0.3663561834842673, + "Opus-4.6_spec_qwk": 0.43979676755288855, + "Opus-4.6_spec_mae": 0.7375, + "Opus-4.6_cat_ece": 0.2126809566716353, + "Opus-4.6_spec_ece": 0.37541975535452365 + } +] \ No newline at end of file diff --git a/results/eval/quant/REPORT.md b/results/eval/quant/REPORT.md new file mode 100644 index 0000000..7b0b179 --- /dev/null +++ b/results/eval/quant/REPORT.md @@ -0,0 +1,163 @@ +# Quantization Sweep — iter1-independent ModernBERT-large + +**Date:** 2026-04-07 +**Checkpoint:** `checkpoints/finetune/iter1-independent/final/` +**Hardware:** RTX 3090 (sm_8.6, 24 GB) +**Eval set:** 1,200-paragraph v2 holdout, proxy gold = GPT-5.4 + Opus-4.6 +**Driver:** `python/scripts/quantize_sweep.py` (run via `bun run py:quant`) + +## Setup + +For each variant the *encoder* (ModernBERT-large backbone, 28 layers, 112 +nn.Linear modules) is converted to the target precision/scheme, while the +attention pooler and the dual heads (category linear + 3 independent +threshold MLPs) are kept in bf16. Heads are <0.3% of params and sit on +already-distilled 1024-d representations — quantizing them buys nothing and +risks the threshold margins that drive most of the spec error budget. + +For every variant we measure end-to-end inference on the full 1,200-paragraph +holdout at batch=64, max_seq=512, after 5 warmup batches: + +- **encoder_mb** — sum of `param.numel() * param.element_size()` over the + encoder. 
**Caveat:** for torchao tensor subclasses (`AffineQuantizedTensor`) + this reports the *outer* dtype (bf16) rather than the int8 storage, so the + 790 MB figure for the torchao rows is an over-estimate; real on-disk + storage is roughly half. The bnb 4-bit row (275 MB) is correct because + `Params4bit` reports `uint8` element_size. +- **ms/sample** — wall-clock per paragraph at batch=64 +- **peak VRAM** — `torch.cuda.max_memory_allocated()` over the timed run + (encoder fwd + activations) +- **F1 / QWK / ECE** — full eval pipeline reused from `src/finetune/eval.py` + +## Results + +| variant | enc MB | ms/samp | thru/s | VRAM MB | cat F1 (GPT) | spec F1 (GPT) | spec QWK | cat F1 (Opus) | spec F1 (Opus) | notes | +|--------------------|-------:|--------:|-------:|--------:|-------------:|--------------:|---------:|--------------:|---------------:|--------------------------------| +| fp32 | 1579 | 16.29 | 61 | 3504 | 0.9337 | 0.8943 | 0.9321 | 0.9227 | 0.8825 | sdpa (no flash-attn) | +| **bf16 (baseline)**| 790 | 5.52 | 181 | 1741 | 0.9337 | 0.8952 | 0.9324 | 0.9227 | 0.8834 | flash-attn-2 | +| fp16 | 790 | 5.54 | 181 | 1741 | 0.9337 | 0.8952 | 0.9324 | 0.9227 | 0.8834 | flash-attn-2 | +| **torchao int8-wo**| ~395* | 6.08 | 165 | 1416 | 0.9345 | 0.8941 | 0.9330 | 0.9235 | 0.8815 | weight-only int8 | +| torchao int8-dyn | ~395* | 9.67 | 103 | 1774 | 0.9336 | 0.8918 | 0.9315 | 0.9243 | 0.8827 | dyn act + int8 weight | +| torchao int4-wo | — | — | — | — | — | — | — | — | — | requires `mslk>=1.0.0` | +| bnb LLM.int8 | ~395* | 7.76 | 129 | 2135 | 0.9361 | 0.8986 | 0.9308 | 0.9235 | 0.8827 | mixed-precision outliers | +| bnb nf4 (DQ) | 275 | 5.86 | 171 | 1287 | 0.3537 | 0.2205 | 0.2423 | 0.3576 | 0.2075 | **collapsed** | +| bnb nf4 (no DQ) | 275 | 5.86 | 171 | 1287 | 0.3537 | 0.2205 | 0.2423 | 0.3576 | 0.2075 | **collapsed** | +| bnb fp4 (no DQ) | 275 | 5.87 | 170 | 1287 | 0.1629 | 0.2085 | 0.2326 | 0.1686 | 0.1978 | **collapsed harder** | + +\*torchao subclass tensors 
report bf16 element_size; true storage ~395 MB. + +Per-variant detail (per-class F1, MCC, AUC, confusion matrices, calibration +bins) is in `results/eval/quant/{variant}/metrics.json`. Aggregate row-level +data is in `results/eval/quant/summary.json`. + +## Findings + +### 1. bf16 is already the production sweet spot +Flash-attention-2 + bf16 gives **3.0× the throughput of fp32** (181 vs 61 +samples/sec) at **half the VRAM** (1.7 vs 3.5 GB) with bit-identical +accuracy. This is what we already train and serve at; the sweep simply +confirms there's no headroom in fp16/fp32 for this hardware. + +### 2. fp16 ≡ bf16 on Ampere +Identical latency, identical VRAM, identical F1. RTX 3090 has matched +bf16/fp16 throughput on tensor cores and the model has no overflow issues +in either format. Pick whichever the loader prefers. + +### 3. torchao int8 weight-only is the only quantization variant worth shipping +- **VRAM −19%** (1741 → 1416 MB) — meaningful for batched serving +- **F1 essentially unchanged** (cat +0.0008, spec −0.0011 vs bf16 — both + inside per-seed noise) +- **Latency +10%** (5.52 → 6.08 ms/sample) — the int8 weight is dequantized + to bf16 on the fly because RTX 3090 (sm_8.6) lacks the int8 tensor-core + matmul kernel paths torchao would otherwise use; on H100/A100/Ada this + same config would also be faster + +The accuracy delta is statistically nothing — well within the ±0.002 std we +observed across the 3-seed ensemble. **This is the variant we'd ship as the +"low-VRAM" deployment option.** + +### 4. torchao int8 dynamic activation: don't bother on this hardware +−43% throughput (5.52 → 9.67 ms/sample) and *more* peak VRAM than bf16 +(1774 vs 1741 MB) because the per-batch activation quantization adds work +without unlocking int8 tensor cores. Pure regression on Ampere. + +### 5. 
bnb LLM.int8: slower than torchao int8-wo, no accuracy upside
+- **+41% latency** (5.52 → 7.76 ms/sample) due to mixed-precision outlier
+  handling (though torchao int8-dyn at 9.67 ms is the slowest int8 overall)
+- **+23% VRAM** (1741 → 2135 MB) — outlier columns are kept in fp16 plus
+  scratch buffers
+- **F1 +0.0024 cat, +0.0034 spec** — within noise; not a real win
+
+bnb LLM.int8 was designed for LLM-scale models where outlier features
+dominate quant error; for an encoder of this size on a single 3090 it
+just trades performance for nothing.
+
+### 6. All 4-bit variants collapse — ModernBERT-large is too quant-sensitive
+Both nf4 (with and without double-quantization) and fp4 produce essentially
+random predictions:
+
+| variant | cat F1 | spec F1 | spec ECE |
+|---------|-------:|--------:|---------:|
+| nf4 | 0.354 | 0.221 | 0.434 |
+| fp4 | 0.163 | 0.209 | 0.443 |
+
+Per-layer dequantization is faithful — we verified that the dequantized
+weight of one MLP Wi layer differs from the original by mean 0.005 / max
+0.11 (sub-1% error). But the relative output drift on a single Linear is
+already ~98% (mean), and that error compounds across 28 transformer blocks
++ GLU FFN paths until the [CLS]/pooled representation no longer carries
+the discriminative signal. The category head essentially collapses to a
+near-uniform prior (cat ECE 0.10 vs the 0.054 baseline) and the threshold
+heads collapse onto L1 because all three thresholds emit similar logits.
+
+The fact that **DQ vs no-DQ are bit-identical** at this scale tells us the
+nf4 weight indices are stable under absmax requantization (only ~5% of the
+weight bytes change, all in the metadata block) — the catastrophe is
+inherent to 4-bit weight precision on this architecture, not to a
+quantization-config knob.
+
+This is a real noteworthy null for the paper: **naive post-training 4-bit
+weight quantization is not viable for ModernBERT-large on this task**.
+Recovering 4-bit would require either (a) QAT, (b) per-channel calibration +with a held-out activation distribution (GPTQ / AWQ-style), or (c) keeping +the GLU FFN in 8-bit and only 4-bit'ing attention projections. None of +these are reachable inside the remaining capstone time budget. + +### 7. torchao int4-wo: dependency hole +torchao 0.17 requires `mslk >= 1.0.0` for the new `Int4Tensor.from_hp` path. +Not installed in the lockfile and not worth chasing given the bnb 4-bit +collapse — even if the kernel ran cleanly we'd expect the same compounding +error pattern. + +## Recommendations + +| Use case | Variant | Why | +|-----------------------------------|--------------------|-------------------------------------------------------------| +| **Production / paper headline** | bf16 | Best of every dimension on this hardware | +| **Low-VRAM batch serving** | torchao int8-wo | −19% VRAM, accuracy intact, only 10% latency penalty | +| **Multi-GPU sharded serving** | bf16 | int8-wo's dequant overhead grows with replica count | +| **Embedded / 4-bit** | not viable | Needs QAT or AWQ-style calibration; future work | + +## Paper-worthy notes + +1. **Quantization story** — bf16 is already the sweet spot; torchao int8-wo + buys 19% VRAM with no accuracy cost; 4-bit fails. This adds another row + to the speed/cost table. +2. **Architecture-specific quant fragility** — ModernBERT-large's GLU FFN + amplifies per-layer weight error across 28 blocks. This is a noteworthy + counterpoint to the 4-bit-by-default LLM serving narrative and worth + one paragraph in the discussion section alongside the DAPT and + CORAL null results. +3. **Hardware caveat** — int8 latency results would invert on + Hopper/Ada/A100; the 3090 just doesn't have the matmul path. State the + sm_8.6 caveat in the table caption. 
+ +## Reproduce + +```bash +# from repo root +bun run py:quant +# writes to results/eval/quant/{summary.json, REPORT.md, /metrics.json} +``` + +Run time: ~5 minutes total (most spent in fp32 + torchao build steps). diff --git a/results/eval/quant/bf16/metrics.json b/results/eval/quant/bf16/metrics.json new file mode 100644 index 0000000..728445e --- /dev/null +++ b/results/eval/quant/bf16/metrics.json @@ -0,0 +1,297 @@ +{ + "GPT-5.4": { + "cat_macro_f1": 0.9336741161693523, + "cat_weighted_f1": 0.9343162998643407, + "cat_macro_precision": 0.93189297179766, + "cat_macro_recall": 0.9377918652022429, + "cat_mcc": 0.9226990724708704, + "cat_auc": 0.991991833154947, + "cat_ece": 0.053848127176364245, + "cat_confusion_matrix": [ + [ + 225, + 0, + 3, + 0, + 2, + 0, + 0 + ], + [ + 0, + 85, + 0, + 0, + 2, + 1, + 0 + ], + [ + 2, + 0, + 144, + 1, + 3, + 0, + 0 + ], + [ + 0, + 0, + 3, + 131, + 0, + 2, + 0 + ], + [ + 6, + 1, + 5, + 19, + 164, + 1, + 2 + ], + [ + 0, + 3, + 1, + 8, + 2, + 207, + 0 + ], + [ + 0, + 0, + 0, + 0, + 12, + 0, + 165 + ] + ], + "cat_f1_BoardGov": 0.9719222462203023, + "cat_prec_BoardGov": 0.9656652360515021, + "cat_recall_BoardGov": 0.9782608695652174, + "cat_f1_Incident": 0.96045197740113, + "cat_prec_Incident": 0.9550561797752809, + "cat_recall_Incident": 0.9659090909090909, + "cat_f1_Manageme": 0.9411764705882353, + "cat_prec_Manageme": 0.9230769230769231, + "cat_recall_Manageme": 0.96, + "cat_f1_NoneOthe": 0.888135593220339, + "cat_prec_NoneOthe": 0.8238993710691824, + "cat_recall_NoneOthe": 0.9632352941176471, + "cat_f1_RiskMana": 0.856396866840731, + "cat_prec_RiskMana": 0.8864864864864865, + "cat_recall_RiskMana": 0.8282828282828283, + "cat_f1_Strategy": 0.9583333333333334, + "cat_prec_Strategy": 0.981042654028436, + "cat_recall_Strategy": 0.9366515837104072, + "cat_f1_Third-Pa": 0.9593023255813954, + "cat_prec_Third-Pa": 0.9880239520958084, + "cat_recall_Third-Pa": 0.9322033898305084, + "cat_kripp_alpha": 0.9223591517560865, + "spec_macro_f1": 
0.8951731906425856, + "spec_weighted_f1": 0.9121524819510628, + "spec_macro_precision": 0.8980417155129858, + "spec_macro_recall": 0.8930560580782194, + "spec_mcc": 0.866381831963237, + "spec_auc": 0.981666223606385, + "spec_ece": 0.07135417198141418, + "spec_confusion_matrix": [ + [ + 580, + 23, + 12, + 3 + ], + [ + 29, + 130, + 7, + 2 + ], + [ + 11, + 4, + 190, + 2 + ], + [ + 2, + 1, + 9, + 195 + ] + ], + "spec_f1_L1Generi": 0.9354838709677419, + "spec_prec_L1Generi": 0.932475884244373, + "spec_recall_L1Generi": 0.9385113268608414, + "spec_f1_L2Domain": 0.7975460122699386, + "spec_prec_L2Domain": 0.8227848101265823, + "spec_recall_L2Domain": 0.7738095238095238, + "spec_f1_L3Firm-S": 0.8941176470588236, + "spec_prec_L3Firm-S": 0.8715596330275229, + "spec_recall_L3Firm-S": 0.9178743961352657, + "spec_f1_L4Quanti": 0.9535452322738386, + "spec_prec_L4Quanti": 0.9653465346534653, + "spec_recall_L4Quanti": 0.9420289855072463, + "spec_qwk": 0.9324447137231142, + "spec_mae": 0.1175, + "spec_kripp_alpha": 0.917725722448833 + }, + "Opus-4.6": { + "cat_macro_f1": 0.922684387023173, + "cat_weighted_f1": 0.9216414809666168, + "cat_macro_precision": 0.9177680939029339, + "cat_macro_recall": 0.9316060900094703, + "cat_mcc": 0.909266938399113, + "cat_auc": 0.9939660707189948, + "cat_ece": 0.06551479384303091, + "cat_confusion_matrix": [ + [ + 211, + 0, + 1, + 1, + 1, + 0, + 0 + ], + [ + 0, + 78, + 0, + 0, + 1, + 0, + 0 + ], + [ + 8, + 0, + 144, + 1, + 4, + 0, + 1 + ], + [ + 0, + 0, + 1, + 138, + 1, + 1, + 0 + ], + [ + 13, + 0, + 9, + 14, + 169, + 1, + 7 + ], + [ + 1, + 11, + 1, + 4, + 3, + 208, + 0 + ], + [ + 0, + 0, + 0, + 1, + 6, + 1, + 159 + ] + ], + "cat_f1_BoardGov": 0.9440715883668904, + "cat_prec_BoardGov": 0.9055793991416309, + "cat_recall_BoardGov": 0.985981308411215, + "cat_f1_Incident": 0.9285714285714286, + "cat_prec_Incident": 0.8764044943820225, + "cat_recall_Incident": 0.9873417721518988, + "cat_f1_Manageme": 0.9171974522292994, + "cat_prec_Manageme": 
0.9230769230769231, + "cat_recall_Manageme": 0.9113924050632911, + "cat_f1_NoneOthe": 0.92, + "cat_prec_NoneOthe": 0.8679245283018868, + "cat_recall_NoneOthe": 0.9787234042553191, + "cat_f1_RiskMana": 0.8492462311557789, + "cat_prec_RiskMana": 0.9135135135135135, + "cat_recall_RiskMana": 0.7934272300469484, + "cat_f1_Strategy": 0.9476082004555809, + "cat_prec_Strategy": 0.985781990521327, + "cat_recall_Strategy": 0.9122807017543859, + "cat_f1_Third-Pa": 0.9520958083832335, + "cat_prec_Third-Pa": 0.9520958083832335, + "cat_recall_Third-Pa": 0.9520958083832335, + "cat_kripp_alpha": 0.908575631724203, + "spec_macro_f1": 0.8833694419146193, + "spec_weighted_f1": 0.9004034318676798, + "spec_macro_precision": 0.8858989636247611, + "spec_macro_recall": 0.8854684685880032, + "spec_mcc": 0.8500778641433316, + "spec_auc": 0.9736633898988131, + "spec_ece": 0.08248284702499709, + "spec_confusion_matrix": [ + [ + 567, + 30, + 7, + 1 + ], + [ + 22, + 118, + 3, + 2 + ], + [ + 33, + 10, + 207, + 10 + ], + [ + 0, + 0, + 1, + 189 + ] + ], + "spec_f1_L1Generi": 0.9242053789731052, + "spec_prec_L1Generi": 0.9115755627009646, + "spec_recall_L1Generi": 0.9371900826446281, + "spec_f1_L2Domain": 0.7788778877887789, + "spec_prec_L2Domain": 0.7468354430379747, + "spec_recall_L2Domain": 0.8137931034482758, + "spec_f1_L3Firm-S": 0.8661087866108786, + "spec_prec_L3Firm-S": 0.9495412844036697, + "spec_recall_L3Firm-S": 0.7961538461538461, + "spec_f1_L4Quanti": 0.9642857142857143, + "spec_prec_L4Quanti": 0.9356435643564357, + "spec_recall_L4Quanti": 0.9947368421052631, + "spec_qwk": 0.9227008860372746, + "spec_mae": 0.13583333333333333, + "spec_kripp_alpha": 0.9065248741550552 + }, + "_runtime": { + "encoder_mb": 789.563648, + "ms_per_sample": 5.516677870764397, + "throughput_per_s": 181.26851402716375, + "peak_vram_mb": 1740.83837890625, + "build_s": 0.48778308398323134 + } +} \ No newline at end of file diff --git a/results/eval/quant/bnb-fp4/metrics.json 
b/results/eval/quant/bnb-fp4/metrics.json new file mode 100644 index 0000000..84c0d44 --- /dev/null +++ b/results/eval/quant/bnb-fp4/metrics.json @@ -0,0 +1,297 @@ +{ + "GPT-5.4": { + "cat_macro_f1": 0.16293893512410998, + "cat_weighted_f1": 0.1746727986514593, + "cat_macro_precision": 0.6289222195093943, + "cat_macro_recall": 0.23220413662370398, + "cat_mcc": 0.22345796853389935, + "cat_auc": 0.8960306312891495, + "cat_ece": 0.2080524676044782, + "cat_confusion_matrix": [ + [ + 8, + 0, + 0, + 0, + 221, + 0, + 1 + ], + [ + 0, + 0, + 0, + 1, + 82, + 0, + 5 + ], + [ + 0, + 0, + 4, + 0, + 145, + 0, + 1 + ], + [ + 0, + 0, + 2, + 3, + 128, + 0, + 3 + ], + [ + 0, + 0, + 0, + 0, + 195, + 0, + 3 + ], + [ + 0, + 0, + 0, + 0, + 208, + 2, + 11 + ], + [ + 0, + 0, + 0, + 0, + 80, + 0, + 97 + ] + ], + "cat_f1_BoardGov": 0.06722689075630252, + "cat_prec_BoardGov": 1.0, + "cat_recall_BoardGov": 0.034782608695652174, + "cat_f1_Incident": 0.0, + "cat_prec_Incident": 0.0, + "cat_recall_Incident": 0.0, + "cat_f1_Manageme": 0.05128205128205128, + "cat_prec_Manageme": 0.6666666666666666, + "cat_recall_Manageme": 0.02666666666666667, + "cat_f1_NoneOthe": 0.04285714285714286, + "cat_prec_NoneOthe": 0.75, + "cat_recall_NoneOthe": 0.022058823529411766, + "cat_f1_RiskMana": 0.31026252983293556, + "cat_prec_RiskMana": 0.18413597733711048, + "cat_recall_RiskMana": 0.9848484848484849, + "cat_f1_Strategy": 0.017937219730941704, + "cat_prec_Strategy": 1.0, + "cat_recall_Strategy": 0.00904977375565611, + "cat_f1_Third-Pa": 0.6510067114093959, + "cat_prec_Third-Pa": 0.8016528925619835, + "cat_recall_Third-Pa": 0.5480225988700564, + "cat_kripp_alpha": -0.08693512028952255, + "spec_macro_f1": 0.20854117827130608, + "spec_weighted_f1": 0.2571301750438355, + "spec_macro_precision": 0.3741612607031285, + "spec_macro_recall": 0.33018440069147115, + "spec_mcc": 0.1895317453505129, + "spec_auc": 0.8110497500610155, + "spec_ece": 0.44289420386155437, + "spec_confusion_matrix": [ + [ + 136, + 473, + 9, + 0 + 
], + [ + 4, + 163, + 1, + 0 + ], + [ + 1, + 179, + 27, + 0 + ], + [ + 2, + 171, + 34, + 0 + ] + ], + "spec_f1_L1Generi": 0.35742444152431013, + "spec_prec_L1Generi": 0.951048951048951, + "spec_recall_L1Generi": 0.22006472491909385, + "spec_f1_L2Domain": 0.2824956672443674, + "spec_prec_L2Domain": 0.16531440162271804, + "spec_recall_L2Domain": 0.9702380952380952, + "spec_f1_L3Firm-S": 0.19424460431654678, + "spec_prec_L3Firm-S": 0.38028169014084506, + "spec_recall_L3Firm-S": 0.13043478260869565, + "spec_f1_L4Quanti": 0.0, + "spec_prec_L4Quanti": 0.0, + "spec_recall_L4Quanti": 0.0, + "spec_qwk": 0.2326064604575444, + "spec_mae": 0.8825, + "spec_kripp_alpha": 0.26499611744119067 + }, + "Opus-4.6": { + "cat_macro_f1": 0.16861118726256397, + "cat_weighted_f1": 0.1792365613004711, + "cat_macro_precision": 0.6306758954840335, + "cat_macro_recall": 0.2357303291121537, + "cat_mcc": 0.2251562222131823, + "cat_auc": 0.8995073249291591, + "cat_ece": 0.19888580093781152, + "cat_confusion_matrix": [ + [ + 8, + 0, + 0, + 0, + 205, + 0, + 1 + ], + [ + 0, + 0, + 0, + 1, + 73, + 0, + 5 + ], + [ + 0, + 0, + 4, + 0, + 154, + 0, + 0 + ], + [ + 0, + 0, + 2, + 3, + 133, + 0, + 3 + ], + [ + 0, + 0, + 0, + 0, + 208, + 0, + 5 + ], + [ + 0, + 0, + 0, + 0, + 216, + 2, + 10 + ], + [ + 0, + 0, + 0, + 0, + 70, + 0, + 97 + ] + ], + "cat_f1_BoardGov": 0.07207207207207207, + "cat_prec_BoardGov": 1.0, + "cat_recall_BoardGov": 0.037383177570093455, + "cat_f1_Incident": 0.0, + "cat_prec_Incident": 0.0, + "cat_recall_Incident": 0.0, + "cat_f1_Manageme": 0.04878048780487805, + "cat_prec_Manageme": 0.6666666666666666, + "cat_recall_Manageme": 0.02531645569620253, + "cat_f1_NoneOthe": 0.041379310344827586, + "cat_prec_NoneOthe": 0.75, + "cat_recall_NoneOthe": 0.02127659574468085, + "cat_f1_RiskMana": 0.3270440251572327, + "cat_prec_RiskMana": 0.1964117091595845, + "cat_recall_RiskMana": 0.9765258215962441, + "cat_f1_Strategy": 0.017391304347826087, + "cat_prec_Strategy": 1.0, + "cat_recall_Strategy": 
0.008771929824561403, + "cat_f1_Third-Pa": 0.6736111111111112, + "cat_prec_Third-Pa": 0.8016528925619835, + "cat_recall_Third-Pa": 0.5808383233532934, + "cat_kripp_alpha": -0.07941064783948448, + "spec_macro_f1": 0.19783939283519508, + "spec_weighted_f1": 0.24886714543281097, + "spec_macro_precision": 0.37592821714182745, + "spec_macro_recall": 0.3291807330600434, + "spec_mcc": 0.18219176358380398, + "spec_auc": 0.790090253498083, + "spec_ece": 0.45814307530721027, + "spec_confusion_matrix": [ + [ + 132, + 466, + 7, + 0 + ], + [ + 1, + 142, + 2, + 0 + ], + [ + 8, + 221, + 31, + 0 + ], + [ + 2, + 157, + 31, + 0 + ] + ], + "spec_f1_L1Generi": 0.35294117647058826, + "spec_prec_L1Generi": 0.9230769230769231, + "spec_recall_L1Generi": 0.21818181818181817, + "spec_f1_L2Domain": 0.251105216622458, + "spec_prec_L2Domain": 0.1440162271805274, + "spec_recall_L2Domain": 0.9793103448275862, + "spec_f1_L3Firm-S": 0.18731117824773413, + "spec_prec_L3Firm-S": 0.43661971830985913, + "spec_recall_L3Firm-S": 0.11923076923076924, + "spec_f1_L4Quanti": 0.0, + "spec_prec_L4Quanti": 0.0, + "spec_recall_L4Quanti": 0.0, + "spec_qwk": 0.22580295138888895, + "spec_mae": 0.8925, + "spec_kripp_alpha": 0.2579634594689497 + }, + "_runtime": { + "encoder_mb": 274.843904, + "ms_per_sample": 5.865302347471394, + "throughput_per_s": 170.49419463109393, + "peak_vram_mb": 1287.34326171875, + "build_s": 0.4887635139748454 + } +} \ No newline at end of file diff --git a/results/eval/quant/bnb-int8/metrics.json b/results/eval/quant/bnb-int8/metrics.json new file mode 100644 index 0000000..3154686 --- /dev/null +++ b/results/eval/quant/bnb-int8/metrics.json @@ -0,0 +1,297 @@ +{ + "GPT-5.4": { + "cat_macro_f1": 0.9360988760303737, + "cat_weighted_f1": 0.9367630863906107, + "cat_macro_precision": 0.934342558672944, + "cat_macro_recall": 0.9404157843351134, + "cat_mcc": 0.9256911778959798, + "cat_auc": 0.9918112947607864, + "cat_ece": 0.052939765204985965, + "cat_confusion_matrix": [ + [ + 226, + 0, + 2, + 
0, + 2, + 0, + 0 + ], + [ + 0, + 85, + 0, + 0, + 2, + 1, + 0 + ], + [ + 2, + 0, + 145, + 1, + 2, + 0, + 0 + ], + [ + 0, + 0, + 3, + 132, + 0, + 1, + 0 + ], + [ + 6, + 1, + 5, + 19, + 164, + 1, + 2 + ], + [ + 0, + 3, + 1, + 8, + 2, + 207, + 0 + ], + [ + 0, + 0, + 0, + 0, + 12, + 0, + 165 + ] + ], + "cat_f1_BoardGov": 0.9741379310344828, + "cat_prec_BoardGov": 0.9658119658119658, + "cat_recall_BoardGov": 0.9826086956521739, + "cat_f1_Incident": 0.96045197740113, + "cat_prec_Incident": 0.9550561797752809, + "cat_recall_Incident": 0.9659090909090909, + "cat_f1_Manageme": 0.9477124183006536, + "cat_prec_Manageme": 0.9294871794871795, + "cat_recall_Manageme": 0.9666666666666667, + "cat_f1_NoneOthe": 0.8918918918918919, + "cat_prec_NoneOthe": 0.825, + "cat_recall_NoneOthe": 0.9705882352941176, + "cat_f1_RiskMana": 0.8586387434554974, + "cat_prec_RiskMana": 0.8913043478260869, + "cat_recall_RiskMana": 0.8282828282828283, + "cat_f1_Strategy": 0.9605568445475638, + "cat_prec_Strategy": 0.9857142857142858, + "cat_recall_Strategy": 0.9366515837104072, + "cat_f1_Third-Pa": 0.9593023255813954, + "cat_prec_Third-Pa": 0.9880239520958084, + "cat_recall_Third-Pa": 0.9322033898305084, + "cat_kripp_alpha": 0.9253092213149172, + "spec_macro_f1": 0.8986323186392307, + "spec_weighted_f1": 0.9144644120807768, + "spec_macro_precision": 0.9034925881673722, + "spec_macro_recall": 0.8950728490354916, + "spec_mcc": 0.870090391628814, + "spec_auc": 0.98134918835569, + "spec_ece": 0.06740866973996164, + "spec_confusion_matrix": [ + [ + 582, + 19, + 14, + 3 + ], + [ + 29, + 130, + 7, + 2 + ], + [ + 12, + 3, + 190, + 2 + ], + [ + 2, + 1, + 8, + 196 + ] + ], + "spec_f1_L1Generi": 0.9364440868865648, + "spec_prec_L1Generi": 0.9312, + "spec_recall_L1Generi": 0.941747572815534, + "spec_f1_L2Domain": 0.8099688473520249, + "spec_prec_L2Domain": 0.8496732026143791, + "spec_recall_L2Domain": 0.7738095238095238, + "spec_f1_L3Firm-S": 0.892018779342723, + "spec_prec_L3Firm-S": 0.867579908675799, + 
"spec_recall_L3Firm-S": 0.9178743961352657, + "spec_f1_L4Quanti": 0.9560975609756097, + "spec_prec_L4Quanti": 0.9655172413793104, + "spec_recall_L4Quanti": 0.9468599033816425, + "spec_qwk": 0.9307948020550015, + "spec_mae": 0.1175, + "spec_kripp_alpha": 0.9166492249745117 + }, + "Opus-4.6": { + "cat_macro_f1": 0.9235105849558979, + "cat_weighted_f1": 0.9224780370334836, + "cat_macro_precision": 0.9187130112710481, + "cat_macro_recall": 0.9326192612354074, + "cat_mcc": 0.9103198007176273, + "cat_auc": 0.9937246318315877, + "cat_ece": 0.06465620135267579, + "cat_confusion_matrix": [ + [ + 211, + 0, + 1, + 1, + 1, + 0, + 0 + ], + [ + 0, + 78, + 0, + 0, + 1, + 0, + 0 + ], + [ + 9, + 0, + 144, + 1, + 3, + 0, + 1 + ], + [ + 0, + 0, + 1, + 139, + 1, + 0, + 0 + ], + [ + 13, + 0, + 9, + 14, + 169, + 1, + 7 + ], + [ + 1, + 11, + 1, + 4, + 3, + 208, + 0 + ], + [ + 0, + 0, + 0, + 1, + 6, + 1, + 159 + ] + ], + "cat_f1_BoardGov": 0.9419642857142857, + "cat_prec_BoardGov": 0.9017094017094017, + "cat_recall_BoardGov": 0.985981308411215, + "cat_f1_Incident": 0.9285714285714286, + "cat_prec_Incident": 0.8764044943820225, + "cat_recall_Incident": 0.9873417721518988, + "cat_f1_Manageme": 0.9171974522292994, + "cat_prec_Manageme": 0.9230769230769231, + "cat_recall_Manageme": 0.9113924050632911, + "cat_f1_NoneOthe": 0.9235880398671097, + "cat_prec_NoneOthe": 0.86875, + "cat_recall_NoneOthe": 0.9858156028368794, + "cat_f1_RiskMana": 0.8513853904282116, + "cat_prec_RiskMana": 0.9184782608695652, + "cat_recall_RiskMana": 0.7934272300469484, + "cat_f1_Strategy": 0.9497716894977168, + "cat_prec_Strategy": 0.9904761904761905, + "cat_recall_Strategy": 0.9122807017543859, + "cat_f1_Third-Pa": 0.9520958083832335, + "cat_prec_Third-Pa": 0.9520958083832335, + "cat_recall_Third-Pa": 0.9520958083832335, + "cat_kripp_alpha": 0.9095619506866199, + "spec_macro_f1": 0.8826923642825633, + "spec_weighted_f1": 0.8991699562480843, + "spec_macro_precision": 0.8862949086294886, + "spec_macro_recall": 
0.8831960153359262, + "spec_mcc": 0.8485449936701916, + "spec_auc": 0.9725823165743999, + "spec_ece": 0.083350846717755, + "spec_confusion_matrix": [ + [ + 568, + 27, + 9, + 1 + ], + [ + 23, + 117, + 3, + 2 + ], + [ + 34, + 9, + 206, + 11 + ], + [ + 0, + 0, + 1, + 189 + ] + ], + "spec_f1_L1Generi": 0.9235772357723577, + "spec_prec_L1Generi": 0.9088, + "spec_recall_L1Generi": 0.9388429752066115, + "spec_f1_L2Domain": 0.785234899328859, + "spec_prec_L2Domain": 0.7647058823529411, + "spec_recall_L2Domain": 0.8068965517241379, + "spec_f1_L3Firm-S": 0.860125260960334, + "spec_prec_L3Firm-S": 0.9406392694063926, + "spec_recall_L3Firm-S": 0.7923076923076923, + "spec_f1_L4Quanti": 0.9618320610687023, + "spec_prec_L4Quanti": 0.9310344827586207, + "spec_recall_L4Quanti": 0.9947368421052631, + "spec_qwk": 0.9198415117342273, + "spec_mae": 0.13916666666666666, + "spec_kripp_alpha": 0.9038906079654127 + }, + "_runtime": { + "encoder_mb": 789.563648, + "ms_per_sample": 7.762363941583317, + "throughput_per_s": 128.82673468103667, + "peak_vram_mb": 2135.203125, + "build_s": 1.1878160500200465 + } +} \ No newline at end of file diff --git a/results/eval/quant/bnb-nf4-nodq/metrics.json b/results/eval/quant/bnb-nf4-nodq/metrics.json new file mode 100644 index 0000000..110cffb --- /dev/null +++ b/results/eval/quant/bnb-nf4-nodq/metrics.json @@ -0,0 +1,297 @@ +{ + "GPT-5.4": { + "cat_macro_f1": 0.3536909012886116, + "cat_weighted_f1": 0.4058815979606338, + "cat_macro_precision": 0.6317997184487815, + "cat_macro_recall": 0.38979766446605063, + "cat_mcc": 0.42471542150657926, + "cat_auc": 0.9205800077405307, + "cat_ece": 0.09734637491405013, + "cat_confusion_matrix": [ + [ + 143, + 0, + 0, + 0, + 85, + 0, + 2 + ], + [ + 0, + 0, + 0, + 4, + 73, + 0, + 11 + ], + [ + 3, + 0, + 20, + 1, + 124, + 0, + 2 + ], + [ + 1, + 0, + 1, + 5, + 122, + 0, + 7 + ], + [ + 0, + 0, + 0, + 0, + 185, + 0, + 13 + ], + [ + 0, + 0, + 0, + 0, + 180, + 28, + 13 + ], + [ + 0, + 0, + 0, + 0, + 22, + 0, + 155 + ] + ], 
+ "cat_f1_BoardGov": 0.7586206896551724, + "cat_prec_BoardGov": 0.9727891156462585, + "cat_recall_BoardGov": 0.6217391304347826, + "cat_f1_Incident": 0.0, + "cat_prec_Incident": 0.0, + "cat_recall_Incident": 0.0, + "cat_f1_Manageme": 0.23391812865497075, + "cat_prec_Manageme": 0.9523809523809523, + "cat_recall_Manageme": 0.13333333333333333, + "cat_f1_NoneOthe": 0.0684931506849315, + "cat_prec_NoneOthe": 0.5, + "cat_recall_NoneOthe": 0.03676470588235294, + "cat_f1_RiskMana": 0.3741152679474216, + "cat_prec_RiskMana": 0.23388116308470291, + "cat_recall_RiskMana": 0.9343434343434344, + "cat_f1_Strategy": 0.2248995983935743, + "cat_prec_Strategy": 1.0, + "cat_recall_Strategy": 0.12669683257918551, + "cat_f1_Third-Pa": 0.8157894736842105, + "cat_prec_Third-Pa": 0.7635467980295566, + "cat_recall_Third-Pa": 0.8757062146892656, + "cat_kripp_alpha": 0.27180867501339423, + "spec_macro_f1": 0.22049451330952025, + "spec_weighted_f1": 0.26278390857815354, + "spec_macro_precision": 0.4075440073341987, + "spec_macro_recall": 0.34148466970860386, + "spec_mcc": 0.20939315966102864, + "spec_auc": 0.8490039116946011, + "spec_ece": 0.43363295723994577, + "spec_confusion_matrix": [ + [ + 132, + 483, + 3, + 0 + ], + [ + 2, + 166, + 0, + 0 + ], + [ + 2, + 171, + 34, + 0 + ], + [ + 0, + 175, + 32, + 0 + ] + ], + "spec_f1_L1Generi": 0.35013262599469497, + "spec_prec_L1Generi": 0.9705882352941176, + "spec_recall_L1Generi": 0.21359223300970873, + "spec_f1_L2Domain": 0.28546861564918313, + "spec_prec_L2Domain": 0.16683417085427135, + "spec_recall_L2Domain": 0.9880952380952381, + "spec_f1_L3Firm-S": 0.2463768115942029, + "spec_prec_L3Firm-S": 0.4927536231884058, + "spec_recall_L3Firm-S": 0.1642512077294686, + "spec_f1_L4Quanti": 0.0, + "spec_prec_L4Quanti": 0.0, + "spec_recall_L4Quanti": 0.0, + "spec_qwk": 0.24233251808742773, + "spec_mae": 0.8733333333333333, + "spec_kripp_alpha": 0.2761091078775676 + }, + "Opus-4.6": { + "cat_macro_f1": 0.35763512449392704, + "cat_weighted_f1": 
0.40173099854659305, + "cat_macro_precision": 0.6354693148020794, + "cat_macro_recall": 0.39500680662311666, + "cat_mcc": 0.42166882753874363, + "cat_auc": 0.9209441610065957, + "cat_ece": 0.09567970824738346, + "cat_confusion_matrix": [ + [ + 141, + 0, + 0, + 0, + 71, + 0, + 2 + ], + [ + 0, + 0, + 0, + 4, + 65, + 0, + 10 + ], + [ + 5, + 0, + 21, + 1, + 131, + 0, + 0 + ], + [ + 1, + 0, + 0, + 5, + 128, + 0, + 7 + ], + [ + 0, + 0, + 0, + 0, + 194, + 0, + 19 + ], + [ + 0, + 0, + 0, + 0, + 186, + 28, + 14 + ], + [ + 0, + 0, + 0, + 0, + 16, + 0, + 151 + ] + ], + "cat_f1_BoardGov": 0.7811634349030471, + "cat_prec_BoardGov": 0.9591836734693877, + "cat_recall_BoardGov": 0.6588785046728972, + "cat_f1_Incident": 0.0, + "cat_prec_Incident": 0.0, + "cat_recall_Incident": 0.0, + "cat_f1_Manageme": 0.2346368715083799, + "cat_prec_Manageme": 1.0, + "cat_recall_Manageme": 0.13291139240506328, + "cat_f1_NoneOthe": 0.06622516556291391, + "cat_prec_NoneOthe": 0.5, + "cat_recall_NoneOthe": 0.03546099290780142, + "cat_f1_RiskMana": 0.38645418326693226, + "cat_prec_RiskMana": 0.24525916561314792, + "cat_recall_RiskMana": 0.9107981220657277, + "cat_f1_Strategy": 0.21875, + "cat_prec_Strategy": 1.0, + "cat_recall_Strategy": 0.12280701754385964, + "cat_f1_Third-Pa": 0.8162162162162162, + "cat_prec_Third-Pa": 0.7438423645320197, + "cat_recall_Third-Pa": 0.9041916167664671, + "cat_kripp_alpha": 0.27338793761748126, + "spec_macro_f1": 0.20754679251319788, + "spec_weighted_f1": 0.25637242485646744, + "spec_macro_precision": 0.40946072005380696, + "spec_macro_recall": 0.33929593134138586, + "spec_mcc": 0.2041103760829744, + "spec_auc": 0.8271022317290393, + "spec_ece": 0.4489923599362374, + "spec_confusion_matrix": [ + [ + 130, + 473, + 2, + 0 + ], + [ + 0, + 145, + 0, + 0 + ], + [ + 6, + 217, + 37, + 0 + ], + [ + 0, + 160, + 30, + 0 + ] + ], + "spec_f1_L1Generi": 0.3508771929824561, + "spec_prec_L1Generi": 0.9558823529411765, + "spec_recall_L1Generi": 0.21487603305785125, + 
"spec_f1_L2Domain": 0.2543859649122807, + "spec_prec_L2Domain": 0.1457286432160804, + "spec_recall_L2Domain": 1.0, + "spec_f1_L3Firm-S": 0.22492401215805471, + "spec_prec_L3Firm-S": 0.5362318840579711, + "spec_recall_L3Firm-S": 0.1423076923076923, + "spec_f1_L4Quanti": 0.0, + "spec_prec_L4Quanti": 0.0, + "spec_recall_L4Quanti": 0.0, + "spec_qwk": 0.24096533359991634, + "spec_mae": 0.88, + "spec_kripp_alpha": 0.2758412395136435 + }, + "_runtime": { + "encoder_mb": 274.843904, + "ms_per_sample": 5.861402786540566, + "throughput_per_s": 170.607623536175, + "peak_vram_mb": 1287.34326171875, + "build_s": 0.4908116469741799 + } +} \ No newline at end of file diff --git a/results/eval/quant/bnb-nf4/metrics.json b/results/eval/quant/bnb-nf4/metrics.json new file mode 100644 index 0000000..f3f9cef --- /dev/null +++ b/results/eval/quant/bnb-nf4/metrics.json @@ -0,0 +1,297 @@ +{ + "GPT-5.4": { + "cat_macro_f1": 0.3536909012886116, + "cat_weighted_f1": 0.4058815979606338, + "cat_macro_precision": 0.6317997184487815, + "cat_macro_recall": 0.38979766446605063, + "cat_mcc": 0.42471542150657926, + "cat_auc": 0.9205800077405307, + "cat_ece": 0.09734637491405013, + "cat_confusion_matrix": [ + [ + 143, + 0, + 0, + 0, + 85, + 0, + 2 + ], + [ + 0, + 0, + 0, + 4, + 73, + 0, + 11 + ], + [ + 3, + 0, + 20, + 1, + 124, + 0, + 2 + ], + [ + 1, + 0, + 1, + 5, + 122, + 0, + 7 + ], + [ + 0, + 0, + 0, + 0, + 185, + 0, + 13 + ], + [ + 0, + 0, + 0, + 0, + 180, + 28, + 13 + ], + [ + 0, + 0, + 0, + 0, + 22, + 0, + 155 + ] + ], + "cat_f1_BoardGov": 0.7586206896551724, + "cat_prec_BoardGov": 0.9727891156462585, + "cat_recall_BoardGov": 0.6217391304347826, + "cat_f1_Incident": 0.0, + "cat_prec_Incident": 0.0, + "cat_recall_Incident": 0.0, + "cat_f1_Manageme": 0.23391812865497075, + "cat_prec_Manageme": 0.9523809523809523, + "cat_recall_Manageme": 0.13333333333333333, + "cat_f1_NoneOthe": 0.0684931506849315, + "cat_prec_NoneOthe": 0.5, + "cat_recall_NoneOthe": 0.03676470588235294, + "cat_f1_RiskMana": 
0.3741152679474216, + "cat_prec_RiskMana": 0.23388116308470291, + "cat_recall_RiskMana": 0.9343434343434344, + "cat_f1_Strategy": 0.2248995983935743, + "cat_prec_Strategy": 1.0, + "cat_recall_Strategy": 0.12669683257918551, + "cat_f1_Third-Pa": 0.8157894736842105, + "cat_prec_Third-Pa": 0.7635467980295566, + "cat_recall_Third-Pa": 0.8757062146892656, + "cat_kripp_alpha": 0.27180867501339423, + "spec_macro_f1": 0.22049451330952025, + "spec_weighted_f1": 0.26278390857815354, + "spec_macro_precision": 0.4075440073341987, + "spec_macro_recall": 0.34148466970860386, + "spec_mcc": 0.20939315966102864, + "spec_auc": 0.8490039116946011, + "spec_ece": 0.43363295723994577, + "spec_confusion_matrix": [ + [ + 132, + 483, + 3, + 0 + ], + [ + 2, + 166, + 0, + 0 + ], + [ + 2, + 171, + 34, + 0 + ], + [ + 0, + 175, + 32, + 0 + ] + ], + "spec_f1_L1Generi": 0.35013262599469497, + "spec_prec_L1Generi": 0.9705882352941176, + "spec_recall_L1Generi": 0.21359223300970873, + "spec_f1_L2Domain": 0.28546861564918313, + "spec_prec_L2Domain": 0.16683417085427135, + "spec_recall_L2Domain": 0.9880952380952381, + "spec_f1_L3Firm-S": 0.2463768115942029, + "spec_prec_L3Firm-S": 0.4927536231884058, + "spec_recall_L3Firm-S": 0.1642512077294686, + "spec_f1_L4Quanti": 0.0, + "spec_prec_L4Quanti": 0.0, + "spec_recall_L4Quanti": 0.0, + "spec_qwk": 0.24233251808742773, + "spec_mae": 0.8733333333333333, + "spec_kripp_alpha": 0.2761091078775676 + }, + "Opus-4.6": { + "cat_macro_f1": 0.35763512449392704, + "cat_weighted_f1": 0.40173099854659305, + "cat_macro_precision": 0.6354693148020794, + "cat_macro_recall": 0.39500680662311666, + "cat_mcc": 0.42166882753874363, + "cat_auc": 0.9209441610065957, + "cat_ece": 0.09567970824738346, + "cat_confusion_matrix": [ + [ + 141, + 0, + 0, + 0, + 71, + 0, + 2 + ], + [ + 0, + 0, + 0, + 4, + 65, + 0, + 10 + ], + [ + 5, + 0, + 21, + 1, + 131, + 0, + 0 + ], + [ + 1, + 0, + 0, + 5, + 128, + 0, + 7 + ], + [ + 0, + 0, + 0, + 0, + 194, + 0, + 19 + ], + [ + 0, + 0, + 0, + 0, + 
186, + 28, + 14 + ], + [ + 0, + 0, + 0, + 0, + 16, + 0, + 151 + ] + ], + "cat_f1_BoardGov": 0.7811634349030471, + "cat_prec_BoardGov": 0.9591836734693877, + "cat_recall_BoardGov": 0.6588785046728972, + "cat_f1_Incident": 0.0, + "cat_prec_Incident": 0.0, + "cat_recall_Incident": 0.0, + "cat_f1_Manageme": 0.2346368715083799, + "cat_prec_Manageme": 1.0, + "cat_recall_Manageme": 0.13291139240506328, + "cat_f1_NoneOthe": 0.06622516556291391, + "cat_prec_NoneOthe": 0.5, + "cat_recall_NoneOthe": 0.03546099290780142, + "cat_f1_RiskMana": 0.38645418326693226, + "cat_prec_RiskMana": 0.24525916561314792, + "cat_recall_RiskMana": 0.9107981220657277, + "cat_f1_Strategy": 0.21875, + "cat_prec_Strategy": 1.0, + "cat_recall_Strategy": 0.12280701754385964, + "cat_f1_Third-Pa": 0.8162162162162162, + "cat_prec_Third-Pa": 0.7438423645320197, + "cat_recall_Third-Pa": 0.9041916167664671, + "cat_kripp_alpha": 0.27338793761748126, + "spec_macro_f1": 0.20754679251319788, + "spec_weighted_f1": 0.25637242485646744, + "spec_macro_precision": 0.40946072005380696, + "spec_macro_recall": 0.33929593134138586, + "spec_mcc": 0.2041103760829744, + "spec_auc": 0.8271022317290393, + "spec_ece": 0.4489923599362374, + "spec_confusion_matrix": [ + [ + 130, + 473, + 2, + 0 + ], + [ + 0, + 145, + 0, + 0 + ], + [ + 6, + 217, + 37, + 0 + ], + [ + 0, + 160, + 30, + 0 + ] + ], + "spec_f1_L1Generi": 0.3508771929824561, + "spec_prec_L1Generi": 0.9558823529411765, + "spec_recall_L1Generi": 0.21487603305785125, + "spec_f1_L2Domain": 0.2543859649122807, + "spec_prec_L2Domain": 0.1457286432160804, + "spec_recall_L2Domain": 1.0, + "spec_f1_L3Firm-S": 0.22492401215805471, + "spec_prec_L3Firm-S": 0.5362318840579711, + "spec_recall_L3Firm-S": 0.1423076923076923, + "spec_f1_L4Quanti": 0.0, + "spec_prec_L4Quanti": 0.0, + "spec_recall_L4Quanti": 0.0, + "spec_qwk": 0.24096533359991634, + "spec_mae": 0.88, + "spec_kripp_alpha": 0.2758412395136435 + }, + "_runtime": { + "encoder_mb": 274.843904, + "ms_per_sample": 
5.860076693982895, + "throughput_per_s": 170.64623079537446, + "peak_vram_mb": 1287.34326171875, + "build_s": 0.4858604749897495 + } +} \ No newline at end of file diff --git a/results/eval/quant/fp16/metrics.json b/results/eval/quant/fp16/metrics.json new file mode 100644 index 0000000..b7f8988 --- /dev/null +++ b/results/eval/quant/fp16/metrics.json @@ -0,0 +1,297 @@ +{ + "GPT-5.4": { + "cat_macro_f1": 0.9336741161693523, + "cat_weighted_f1": 0.9343162998643407, + "cat_macro_precision": 0.93189297179766, + "cat_macro_recall": 0.9377918652022429, + "cat_mcc": 0.9226990724708704, + "cat_auc": 0.9920503365435541, + "cat_ece": 0.053747650533914546, + "cat_confusion_matrix": [ + [ + 225, + 0, + 3, + 0, + 2, + 0, + 0 + ], + [ + 0, + 85, + 0, + 0, + 2, + 1, + 0 + ], + [ + 2, + 0, + 144, + 1, + 3, + 0, + 0 + ], + [ + 0, + 0, + 3, + 131, + 0, + 2, + 0 + ], + [ + 6, + 1, + 5, + 19, + 164, + 1, + 2 + ], + [ + 0, + 3, + 1, + 8, + 2, + 207, + 0 + ], + [ + 0, + 0, + 0, + 0, + 12, + 0, + 165 + ] + ], + "cat_f1_BoardGov": 0.9719222462203023, + "cat_prec_BoardGov": 0.9656652360515021, + "cat_recall_BoardGov": 0.9782608695652174, + "cat_f1_Incident": 0.96045197740113, + "cat_prec_Incident": 0.9550561797752809, + "cat_recall_Incident": 0.9659090909090909, + "cat_f1_Manageme": 0.9411764705882353, + "cat_prec_Manageme": 0.9230769230769231, + "cat_recall_Manageme": 0.96, + "cat_f1_NoneOthe": 0.888135593220339, + "cat_prec_NoneOthe": 0.8238993710691824, + "cat_recall_NoneOthe": 0.9632352941176471, + "cat_f1_RiskMana": 0.856396866840731, + "cat_prec_RiskMana": 0.8864864864864865, + "cat_recall_RiskMana": 0.8282828282828283, + "cat_f1_Strategy": 0.9583333333333334, + "cat_prec_Strategy": 0.981042654028436, + "cat_recall_Strategy": 0.9366515837104072, + "cat_f1_Third-Pa": 0.9593023255813954, + "cat_prec_Third-Pa": 0.9880239520958084, + "cat_recall_Third-Pa": 0.9322033898305084, + "cat_kripp_alpha": 0.9223591517560865, + "spec_macro_f1": 0.8951731906425856, + "spec_weighted_f1": 
0.9121524819510628, + "spec_macro_precision": 0.8980417155129858, + "spec_macro_recall": 0.8930560580782194, + "spec_mcc": 0.866381831963237, + "spec_auc": 0.981767664615518, + "spec_ece": 0.07004868157207966, + "spec_confusion_matrix": [ + [ + 580, + 23, + 12, + 3 + ], + [ + 29, + 130, + 7, + 2 + ], + [ + 11, + 4, + 190, + 2 + ], + [ + 2, + 1, + 9, + 195 + ] + ], + "spec_f1_L1Generi": 0.9354838709677419, + "spec_prec_L1Generi": 0.932475884244373, + "spec_recall_L1Generi": 0.9385113268608414, + "spec_f1_L2Domain": 0.7975460122699386, + "spec_prec_L2Domain": 0.8227848101265823, + "spec_recall_L2Domain": 0.7738095238095238, + "spec_f1_L3Firm-S": 0.8941176470588236, + "spec_prec_L3Firm-S": 0.8715596330275229, + "spec_recall_L3Firm-S": 0.9178743961352657, + "spec_f1_L4Quanti": 0.9535452322738386, + "spec_prec_L4Quanti": 0.9653465346534653, + "spec_recall_L4Quanti": 0.9420289855072463, + "spec_qwk": 0.9324447137231142, + "spec_mae": 0.1175, + "spec_kripp_alpha": 0.917725722448833 + }, + "Opus-4.6": { + "cat_macro_f1": 0.922684387023173, + "cat_weighted_f1": 0.9216414809666168, + "cat_macro_precision": 0.9177680939029339, + "cat_macro_recall": 0.9316060900094703, + "cat_mcc": 0.909266938399113, + "cat_auc": 0.993963602835296, + "cat_ece": 0.06541431720058125, + "cat_confusion_matrix": [ + [ + 211, + 0, + 1, + 1, + 1, + 0, + 0 + ], + [ + 0, + 78, + 0, + 0, + 1, + 0, + 0 + ], + [ + 8, + 0, + 144, + 1, + 4, + 0, + 1 + ], + [ + 0, + 0, + 1, + 138, + 1, + 1, + 0 + ], + [ + 13, + 0, + 9, + 14, + 169, + 1, + 7 + ], + [ + 1, + 11, + 1, + 4, + 3, + 208, + 0 + ], + [ + 0, + 0, + 0, + 1, + 6, + 1, + 159 + ] + ], + "cat_f1_BoardGov": 0.9440715883668904, + "cat_prec_BoardGov": 0.9055793991416309, + "cat_recall_BoardGov": 0.985981308411215, + "cat_f1_Incident": 0.9285714285714286, + "cat_prec_Incident": 0.8764044943820225, + "cat_recall_Incident": 0.9873417721518988, + "cat_f1_Manageme": 0.9171974522292994, + "cat_prec_Manageme": 0.9230769230769231, + "cat_recall_Manageme": 
0.9113924050632911, + "cat_f1_NoneOthe": 0.92, + "cat_prec_NoneOthe": 0.8679245283018868, + "cat_recall_NoneOthe": 0.9787234042553191, + "cat_f1_RiskMana": 0.8492462311557789, + "cat_prec_RiskMana": 0.9135135135135135, + "cat_recall_RiskMana": 0.7934272300469484, + "cat_f1_Strategy": 0.9476082004555809, + "cat_prec_Strategy": 0.985781990521327, + "cat_recall_Strategy": 0.9122807017543859, + "cat_f1_Third-Pa": 0.9520958083832335, + "cat_prec_Third-Pa": 0.9520958083832335, + "cat_recall_Third-Pa": 0.9520958083832335, + "cat_kripp_alpha": 0.908575631724203, + "spec_macro_f1": 0.8833694419146193, + "spec_weighted_f1": 0.9004034318676798, + "spec_macro_precision": 0.8858989636247611, + "spec_macro_recall": 0.8854684685880032, + "spec_mcc": 0.8500778641433316, + "spec_auc": 0.9736589116420353, + "spec_ece": 0.0816012116521597, + "spec_confusion_matrix": [ + [ + 567, + 30, + 7, + 1 + ], + [ + 22, + 118, + 3, + 2 + ], + [ + 33, + 10, + 207, + 10 + ], + [ + 0, + 0, + 1, + 189 + ] + ], + "spec_f1_L1Generi": 0.9242053789731052, + "spec_prec_L1Generi": 0.9115755627009646, + "spec_recall_L1Generi": 0.9371900826446281, + "spec_f1_L2Domain": 0.7788778877887789, + "spec_prec_L2Domain": 0.7468354430379747, + "spec_recall_L2Domain": 0.8137931034482758, + "spec_f1_L3Firm-S": 0.8661087866108786, + "spec_prec_L3Firm-S": 0.9495412844036697, + "spec_recall_L3Firm-S": 0.7961538461538461, + "spec_f1_L4Quanti": 0.9642857142857143, + "spec_prec_L4Quanti": 0.9356435643564357, + "spec_recall_L4Quanti": 0.9947368421052631, + "spec_qwk": 0.9227008860372746, + "spec_mae": 0.13583333333333333, + "spec_kripp_alpha": 0.9065248741550552 + }, + "_runtime": { + "encoder_mb": 789.563648, + "ms_per_sample": 5.539002780715236, + "throughput_per_s": 180.53791261517884, + "peak_vram_mb": 1740.83837890625, + "build_s": 0.46582157304510474 + } +} \ No newline at end of file diff --git a/results/eval/quant/fp32/metrics.json b/results/eval/quant/fp32/metrics.json new file mode 100644 index 0000000..1bc3ea7 --- 
/dev/null +++ b/results/eval/quant/fp32/metrics.json @@ -0,0 +1,297 @@ +{ + "GPT-5.4": { + "cat_macro_f1": 0.9336741161693523, + "cat_weighted_f1": 0.9343162998643407, + "cat_macro_precision": 0.93189297179766, + "cat_macro_recall": 0.9377918652022429, + "cat_mcc": 0.9226990724708704, + "cat_auc": 0.9920546854722492, + "cat_ece": 0.05388230005900064, + "cat_confusion_matrix": [ + [ + 225, + 0, + 3, + 0, + 2, + 0, + 0 + ], + [ + 0, + 85, + 0, + 0, + 2, + 1, + 0 + ], + [ + 2, + 0, + 144, + 1, + 3, + 0, + 0 + ], + [ + 0, + 0, + 3, + 131, + 0, + 2, + 0 + ], + [ + 6, + 1, + 5, + 19, + 164, + 1, + 2 + ], + [ + 0, + 3, + 1, + 8, + 2, + 207, + 0 + ], + [ + 0, + 0, + 0, + 0, + 12, + 0, + 165 + ] + ], + "cat_f1_BoardGov": 0.9719222462203023, + "cat_prec_BoardGov": 0.9656652360515021, + "cat_recall_BoardGov": 0.9782608695652174, + "cat_f1_Incident": 0.96045197740113, + "cat_prec_Incident": 0.9550561797752809, + "cat_recall_Incident": 0.9659090909090909, + "cat_f1_Manageme": 0.9411764705882353, + "cat_prec_Manageme": 0.9230769230769231, + "cat_recall_Manageme": 0.96, + "cat_f1_NoneOthe": 0.888135593220339, + "cat_prec_NoneOthe": 0.8238993710691824, + "cat_recall_NoneOthe": 0.9632352941176471, + "cat_f1_RiskMana": 0.856396866840731, + "cat_prec_RiskMana": 0.8864864864864865, + "cat_recall_RiskMana": 0.8282828282828283, + "cat_f1_Strategy": 0.9583333333333334, + "cat_prec_Strategy": 0.981042654028436, + "cat_recall_Strategy": 0.9366515837104072, + "cat_f1_Third-Pa": 0.9593023255813954, + "cat_prec_Third-Pa": 0.9880239520958084, + "cat_recall_Third-Pa": 0.9322033898305084, + "cat_kripp_alpha": 0.9223591517560865, + "spec_macro_f1": 0.8943486525770918, + "spec_weighted_f1": 0.9113685505226937, + "spec_macro_precision": 0.896720845083131, + "spec_macro_recall": 0.8926515273338829, + "spec_mcc": 0.8651595302642376, + "spec_auc": 0.9817804600696421, + "spec_ece": 0.07088303024570146, + "spec_confusion_matrix": [ + [ + 579, + 24, + 12, + 3 + ], + [ + 29, + 130, + 7, + 2 + ], + [ + 11, 
+ 4, + 190, + 2 + ], + [ + 2, + 1, + 9, + 195 + ] + ], + "spec_f1_L1Generi": 0.9346246973365617, + "spec_prec_L1Generi": 0.9323671497584541, + "spec_recall_L1Generi": 0.9368932038834952, + "spec_f1_L2Domain": 0.7951070336391437, + "spec_prec_L2Domain": 0.8176100628930818, + "spec_recall_L2Domain": 0.7738095238095238, + "spec_f1_L3Firm-S": 0.8941176470588236, + "spec_prec_L3Firm-S": 0.8715596330275229, + "spec_recall_L3Firm-S": 0.9178743961352657, + "spec_f1_L4Quanti": 0.9535452322738386, + "spec_prec_L4Quanti": 0.9653465346534653, + "spec_recall_L4Quanti": 0.9420289855072463, + "spec_qwk": 0.9321211092744079, + "spec_mae": 0.11833333333333333, + "spec_kripp_alpha": 0.9170875429859872 + }, + "Opus-4.6": { + "cat_macro_f1": 0.922684387023173, + "cat_weighted_f1": 0.9216414809666168, + "cat_macro_precision": 0.9177680939029339, + "cat_macro_recall": 0.9316060900094703, + "cat_mcc": 0.909266938399113, + "cat_auc": 0.9939567083286731, + "cat_ece": 0.06541596949100496, + "cat_confusion_matrix": [ + [ + 211, + 0, + 1, + 1, + 1, + 0, + 0 + ], + [ + 0, + 78, + 0, + 0, + 1, + 0, + 0 + ], + [ + 8, + 0, + 144, + 1, + 4, + 0, + 1 + ], + [ + 0, + 0, + 1, + 138, + 1, + 1, + 0 + ], + [ + 13, + 0, + 9, + 14, + 169, + 1, + 7 + ], + [ + 1, + 11, + 1, + 4, + 3, + 208, + 0 + ], + [ + 0, + 0, + 0, + 1, + 6, + 1, + 159 + ] + ], + "cat_f1_BoardGov": 0.9440715883668904, + "cat_prec_BoardGov": 0.9055793991416309, + "cat_recall_BoardGov": 0.985981308411215, + "cat_f1_Incident": 0.9285714285714286, + "cat_prec_Incident": 0.8764044943820225, + "cat_recall_Incident": 0.9873417721518988, + "cat_f1_Manageme": 0.9171974522292994, + "cat_prec_Manageme": 0.9230769230769231, + "cat_recall_Manageme": 0.9113924050632911, + "cat_f1_NoneOthe": 0.92, + "cat_prec_NoneOthe": 0.8679245283018868, + "cat_recall_NoneOthe": 0.9787234042553191, + "cat_f1_RiskMana": 0.8492462311557789, + "cat_prec_RiskMana": 0.9135135135135135, + "cat_recall_RiskMana": 0.7934272300469484, + "cat_f1_Strategy": 0.9476082004555809, + 
"cat_prec_Strategy": 0.985781990521327, + "cat_recall_Strategy": 0.9122807017543859, + "cat_f1_Third-Pa": 0.9520958083832335, + "cat_prec_Third-Pa": 0.9520958083832335, + "cat_recall_Third-Pa": 0.9520958083832335, + "cat_kripp_alpha": 0.908575631724203, + "spec_macro_f1": 0.8825095464914274, + "spec_weighted_f1": 0.8996514471183623, + "spec_macro_precision": 0.8846890964606792, + "spec_macro_recall": 0.8850552454475074, + "spec_mcc": 0.8488763096810703, + "spec_auc": 0.9736482774372809, + "spec_ece": 0.08238246644536655, + "spec_confusion_matrix": [ + [ + 566, + 31, + 7, + 1 + ], + [ + 22, + 118, + 3, + 2 + ], + [ + 33, + 10, + 207, + 10 + ], + [ + 0, + 0, + 1, + 189 + ] + ], + "spec_f1_L1Generi": 0.9233278955954323, + "spec_prec_L1Generi": 0.9114331723027376, + "spec_recall_L1Generi": 0.9355371900826446, + "spec_f1_L2Domain": 0.7763157894736842, + "spec_prec_L2Domain": 0.7421383647798742, + "spec_recall_L2Domain": 0.8137931034482758, + "spec_f1_L3Firm-S": 0.8661087866108786, + "spec_prec_L3Firm-S": 0.9495412844036697, + "spec_recall_L3Firm-S": 0.7961538461538461, + "spec_f1_L4Quanti": 0.9642857142857143, + "spec_prec_L4Quanti": 0.9356435643564357, + "spec_recall_L4Quanti": 0.9947368421052631, + "spec_qwk": 0.9223702541559166, + "spec_mae": 0.13666666666666666, + "spec_kripp_alpha": 0.9059072309806726 + }, + "_runtime": { + "encoder_mb": 1579.127296, + "ms_per_sample": 16.293709366727853, + "throughput_per_s": 61.37337898281309, + "peak_vram_mb": 3503.53369140625, + "build_s": 0.6251941699883901 + } +} \ No newline at end of file diff --git a/results/eval/quant/summary.json b/results/eval/quant/summary.json new file mode 100644 index 0000000..4afddf9 --- /dev/null +++ b/results/eval/quant/summary.json @@ -0,0 +1,286 @@ +[ + { + "variant": "fp32", + "description": "Float32 encoder + heads", + "encoder_mb": 1579.127296, + "ms_per_sample": 16.293709366727853, + "throughput_per_s": 61.37337898281309, + "peak_vram_mb": 3503.53369140625, + "build_s": 0.6251941699883901, 
+ "GPT-5.4_cat_f1": 0.9336741161693523, + "GPT-5.4_spec_f1": 0.8943486525770918, + "GPT-5.4_cat_mcc": 0.9226990724708704, + "GPT-5.4_spec_qwk": 0.9321211092744079, + "GPT-5.4_spec_mae": 0.11833333333333333, + "GPT-5.4_cat_ece": 0.05388230005900064, + "GPT-5.4_spec_ece": 0.07088303024570146, + "GPT-5.4_spec_f1_L1Generi": 0.9346246973365617, + "GPT-5.4_spec_f1_L2Domain": 0.7951070336391437, + "GPT-5.4_spec_f1_L3Firm-S": 0.8941176470588236, + "GPT-5.4_spec_f1_L4Quanti": 0.9535452322738386, + "Opus-4.6_cat_f1": 0.922684387023173, + "Opus-4.6_spec_f1": 0.8825095464914274, + "Opus-4.6_cat_mcc": 0.909266938399113, + "Opus-4.6_spec_qwk": 0.9223702541559166, + "Opus-4.6_spec_mae": 0.13666666666666666, + "Opus-4.6_cat_ece": 0.06541596949100496, + "Opus-4.6_spec_ece": 0.08238246644536655, + "Opus-4.6_spec_f1_L1Generi": 0.9233278955954323, + "Opus-4.6_spec_f1_L2Domain": 0.7763157894736842, + "Opus-4.6_spec_f1_L3Firm-S": 0.8661087866108786, + "Opus-4.6_spec_f1_L4Quanti": 0.9642857142857143 + }, + { + "variant": "bf16", + "description": "BFloat16 baseline (matches eval pipeline)", + "encoder_mb": 789.563648, + "ms_per_sample": 5.516677870764397, + "throughput_per_s": 181.26851402716375, + "peak_vram_mb": 1740.83837890625, + "build_s": 0.48778308398323134, + "GPT-5.4_cat_f1": 0.9336741161693523, + "GPT-5.4_spec_f1": 0.8951731906425856, + "GPT-5.4_cat_mcc": 0.9226990724708704, + "GPT-5.4_spec_qwk": 0.9324447137231142, + "GPT-5.4_spec_mae": 0.1175, + "GPT-5.4_cat_ece": 0.053848127176364245, + "GPT-5.4_spec_ece": 0.07135417198141418, + "GPT-5.4_spec_f1_L1Generi": 0.9354838709677419, + "GPT-5.4_spec_f1_L2Domain": 0.7975460122699386, + "GPT-5.4_spec_f1_L3Firm-S": 0.8941176470588236, + "GPT-5.4_spec_f1_L4Quanti": 0.9535452322738386, + "Opus-4.6_cat_f1": 0.922684387023173, + "Opus-4.6_spec_f1": 0.8833694419146193, + "Opus-4.6_cat_mcc": 0.909266938399113, + "Opus-4.6_spec_qwk": 0.9227008860372746, + "Opus-4.6_spec_mae": 0.13583333333333333, + "Opus-4.6_cat_ece": 0.06551479384303091, + 
"Opus-4.6_spec_ece": 0.08248284702499709, + "Opus-4.6_spec_f1_L1Generi": 0.9242053789731052, + "Opus-4.6_spec_f1_L2Domain": 0.7788778877887789, + "Opus-4.6_spec_f1_L3Firm-S": 0.8661087866108786, + "Opus-4.6_spec_f1_L4Quanti": 0.9642857142857143 + }, + { + "variant": "fp16", + "description": "Float16 encoder + heads", + "encoder_mb": 789.563648, + "ms_per_sample": 5.539002780715236, + "throughput_per_s": 180.53791261517884, + "peak_vram_mb": 1740.83837890625, + "build_s": 0.46582157304510474, + "GPT-5.4_cat_f1": 0.9336741161693523, + "GPT-5.4_spec_f1": 0.8951731906425856, + "GPT-5.4_cat_mcc": 0.9226990724708704, + "GPT-5.4_spec_qwk": 0.9324447137231142, + "GPT-5.4_spec_mae": 0.1175, + "GPT-5.4_cat_ece": 0.053747650533914546, + "GPT-5.4_spec_ece": 0.07004868157207966, + "GPT-5.4_spec_f1_L1Generi": 0.9354838709677419, + "GPT-5.4_spec_f1_L2Domain": 0.7975460122699386, + "GPT-5.4_spec_f1_L3Firm-S": 0.8941176470588236, + "GPT-5.4_spec_f1_L4Quanti": 0.9535452322738386, + "Opus-4.6_cat_f1": 0.922684387023173, + "Opus-4.6_spec_f1": 0.8833694419146193, + "Opus-4.6_cat_mcc": 0.909266938399113, + "Opus-4.6_spec_qwk": 0.9227008860372746, + "Opus-4.6_spec_mae": 0.13583333333333333, + "Opus-4.6_cat_ece": 0.06541431720058125, + "Opus-4.6_spec_ece": 0.0816012116521597, + "Opus-4.6_spec_f1_L1Generi": 0.9242053789731052, + "Opus-4.6_spec_f1_L2Domain": 0.7788778877887789, + "Opus-4.6_spec_f1_L3Firm-S": 0.8661087866108786, + "Opus-4.6_spec_f1_L4Quanti": 0.9642857142857143 + }, + { + "variant": "torchao-int8-wo", + "description": "torchao Int8 weight-only on encoder linears", + "encoder_mb": 789.563648, + "ms_per_sample": 6.078403938445263, + "throughput_per_s": 164.5168715549004, + "peak_vram_mb": 1416.36376953125, + "build_s": 0.5027359619853087, + "GPT-5.4_cat_f1": 0.9344870894825886, + "GPT-5.4_spec_f1": 0.8941203230194683, + "GPT-5.4_cat_mcc": 0.9237006314618685, + "GPT-5.4_spec_qwk": 0.9329693660903852, + "GPT-5.4_spec_mae": 0.1175, + "GPT-5.4_cat_ece": 0.05415941931307314, + 
"GPT-5.4_spec_ece": 0.06980206420024232, + "GPT-5.4_spec_f1_L1Generi": 0.9353796445880452, + "GPT-5.4_spec_f1_L2Domain": 0.793939393939394, + "GPT-5.4_spec_f1_L3Firm-S": 0.8936170212765957, + "GPT-5.4_spec_f1_L4Quanti": 0.9535452322738386, + "Opus-4.6_cat_f1": 0.9234810481200378, + "Opus-4.6_spec_f1": 0.8814731397444973, + "Opus-4.6_cat_mcc": 0.9102750101817324, + "Opus-4.6_spec_qwk": 0.9207708779443254, + "Opus-4.6_spec_mae": 0.13916666666666666, + "Opus-4.6_cat_ece": 0.0641141641388337, + "Opus-4.6_spec_ece": 0.08370273689428968, + "Opus-4.6_spec_f1_L1Generi": 0.9208163265306123, + "Opus-4.6_spec_f1_L2Domain": 0.7752442996742671, + "Opus-4.6_spec_f1_L3Firm-S": 0.865546218487395, + "Opus-4.6_spec_f1_L4Quanti": 0.9642857142857143 + }, + { + "variant": "torchao-int8-dyn", + "description": "torchao Int8 dynamic activation + Int8 weight on encoder", + "encoder_mb": 789.563648, + "ms_per_sample": 9.671733896636093, + "throughput_per_s": 103.39407707937539, + "peak_vram_mb": 1774.27392578125, + "build_s": 0.4831273259478621, + "GPT-5.4_cat_f1": 0.9336475878058536, + "GPT-5.4_spec_f1": 0.8918479759675974, + "GPT-5.4_cat_mcc": 0.9226968780743573, + "GPT-5.4_spec_qwk": 0.931514217618119, + "GPT-5.4_spec_mae": 0.12, + "GPT-5.4_cat_ece": 0.05363284418980283, + "GPT-5.4_spec_ece": 0.07049367701013878, + "GPT-5.4_spec_f1_L1Generi": 0.934412955465587, + "GPT-5.4_spec_f1_L2Domain": 0.7889908256880734, + "GPT-5.4_spec_f1_L3Firm-S": 0.8904428904428905, + "GPT-5.4_spec_f1_L4Quanti": 0.9535452322738386, + "Opus-4.6_cat_f1": 0.9242573204255528, + "Opus-4.6_spec_f1": 0.8827245859621925, + "Opus-4.6_cat_mcc": 0.9112549308356716, + "Opus-4.6_spec_qwk": 0.92235918049198, + "Opus-4.6_spec_mae": 0.13666666666666666, + "Opus-4.6_cat_ece": 0.06330573419729862, + "Opus-4.6_spec_ece": 0.08290670409798626, + "Opus-4.6_spec_f1_L1Generi": 0.9230769230769231, + "Opus-4.6_spec_f1_L2Domain": 0.7763157894736842, + "Opus-4.6_spec_f1_L3Firm-S": 0.8672199170124482, + "Opus-4.6_spec_f1_L4Quanti": 
0.9642857142857143 + }, + { + "variant": "torchao-int4-wo", + "description": "torchao Int4 weight-only (group=128) on encoder", + "error": "ImportError: Requires mslk >= 1.0.0" + }, + { + "variant": "bnb-int8", + "description": "bitsandbytes LLM.int8 on encoder linears", + "encoder_mb": 789.563648, + "ms_per_sample": 7.762363941583317, + "throughput_per_s": 128.82673468103667, + "peak_vram_mb": 2135.203125, + "build_s": 1.1878160500200465, + "GPT-5.4_cat_f1": 0.9360988760303737, + "GPT-5.4_spec_f1": 0.8986323186392307, + "GPT-5.4_cat_mcc": 0.9256911778959798, + "GPT-5.4_spec_qwk": 0.9307948020550015, + "GPT-5.4_spec_mae": 0.1175, + "GPT-5.4_cat_ece": 0.052939765204985965, + "GPT-5.4_spec_ece": 0.06740866973996164, + "GPT-5.4_spec_f1_L1Generi": 0.9364440868865648, + "GPT-5.4_spec_f1_L2Domain": 0.8099688473520249, + "GPT-5.4_spec_f1_L3Firm-S": 0.892018779342723, + "GPT-5.4_spec_f1_L4Quanti": 0.9560975609756097, + "Opus-4.6_cat_f1": 0.9235105849558979, + "Opus-4.6_spec_f1": 0.8826923642825633, + "Opus-4.6_cat_mcc": 0.9103198007176273, + "Opus-4.6_spec_qwk": 0.9198415117342273, + "Opus-4.6_spec_mae": 0.13916666666666666, + "Opus-4.6_cat_ece": 0.06465620135267579, + "Opus-4.6_spec_ece": 0.083350846717755, + "Opus-4.6_spec_f1_L1Generi": 0.9235772357723577, + "Opus-4.6_spec_f1_L2Domain": 0.785234899328859, + "Opus-4.6_spec_f1_L3Firm-S": 0.860125260960334, + "Opus-4.6_spec_f1_L4Quanti": 0.9618320610687023 + }, + { + "variant": "bnb-nf4", + "description": "bitsandbytes NF4 4-bit (double-quant, bf16 compute)", + "encoder_mb": 274.843904, + "ms_per_sample": 5.860076693982895, + "throughput_per_s": 170.64623079537446, + "peak_vram_mb": 1287.34326171875, + "build_s": 0.4858604749897495, + "GPT-5.4_cat_f1": 0.3536909012886116, + "GPT-5.4_spec_f1": 0.22049451330952025, + "GPT-5.4_cat_mcc": 0.42471542150657926, + "GPT-5.4_spec_qwk": 0.24233251808742773, + "GPT-5.4_spec_mae": 0.8733333333333333, + "GPT-5.4_cat_ece": 0.09734637491405013, + "GPT-5.4_spec_ece": 0.43363295723994577, + 
"GPT-5.4_spec_f1_L1Generi": 0.35013262599469497, + "GPT-5.4_spec_f1_L2Domain": 0.28546861564918313, + "GPT-5.4_spec_f1_L3Firm-S": 0.2463768115942029, + "GPT-5.4_spec_f1_L4Quanti": 0.0, + "Opus-4.6_cat_f1": 0.35763512449392704, + "Opus-4.6_spec_f1": 0.20754679251319788, + "Opus-4.6_cat_mcc": 0.42166882753874363, + "Opus-4.6_spec_qwk": 0.24096533359991634, + "Opus-4.6_spec_mae": 0.88, + "Opus-4.6_cat_ece": 0.09567970824738346, + "Opus-4.6_spec_ece": 0.4489923599362374, + "Opus-4.6_spec_f1_L1Generi": 0.3508771929824561, + "Opus-4.6_spec_f1_L2Domain": 0.2543859649122807, + "Opus-4.6_spec_f1_L3Firm-S": 0.22492401215805471, + "Opus-4.6_spec_f1_L4Quanti": 0.0 + }, + { + "variant": "bnb-nf4-nodq", + "description": "bitsandbytes NF4 4-bit (no double-quant)", + "encoder_mb": 274.843904, + "ms_per_sample": 5.861402786540566, + "throughput_per_s": 170.607623536175, + "peak_vram_mb": 1287.34326171875, + "build_s": 0.4908116469741799, + "GPT-5.4_cat_f1": 0.3536909012886116, + "GPT-5.4_spec_f1": 0.22049451330952025, + "GPT-5.4_cat_mcc": 0.42471542150657926, + "GPT-5.4_spec_qwk": 0.24233251808742773, + "GPT-5.4_spec_mae": 0.8733333333333333, + "GPT-5.4_cat_ece": 0.09734637491405013, + "GPT-5.4_spec_ece": 0.43363295723994577, + "GPT-5.4_spec_f1_L1Generi": 0.35013262599469497, + "GPT-5.4_spec_f1_L2Domain": 0.28546861564918313, + "GPT-5.4_spec_f1_L3Firm-S": 0.2463768115942029, + "GPT-5.4_spec_f1_L4Quanti": 0.0, + "Opus-4.6_cat_f1": 0.35763512449392704, + "Opus-4.6_spec_f1": 0.20754679251319788, + "Opus-4.6_cat_mcc": 0.42166882753874363, + "Opus-4.6_spec_qwk": 0.24096533359991634, + "Opus-4.6_spec_mae": 0.88, + "Opus-4.6_cat_ece": 0.09567970824738346, + "Opus-4.6_spec_ece": 0.4489923599362374, + "Opus-4.6_spec_f1_L1Generi": 0.3508771929824561, + "Opus-4.6_spec_f1_L2Domain": 0.2543859649122807, + "Opus-4.6_spec_f1_L3Firm-S": 0.22492401215805471, + "Opus-4.6_spec_f1_L4Quanti": 0.0 + }, + { + "variant": "bnb-fp4", + "description": "bitsandbytes FP4 4-bit (no double-quant)", + 
"encoder_mb": 274.843904, + "ms_per_sample": 5.865302347471394, + "throughput_per_s": 170.49419463109393, + "peak_vram_mb": 1287.34326171875, + "build_s": 0.4887635139748454, + "GPT-5.4_cat_f1": 0.16293893512410998, + "GPT-5.4_spec_f1": 0.20854117827130608, + "GPT-5.4_cat_mcc": 0.22345796853389935, + "GPT-5.4_spec_qwk": 0.2326064604575444, + "GPT-5.4_spec_mae": 0.8825, + "GPT-5.4_cat_ece": 0.2080524676044782, + "GPT-5.4_spec_ece": 0.44289420386155437, + "GPT-5.4_spec_f1_L1Generi": 0.35742444152431013, + "GPT-5.4_spec_f1_L2Domain": 0.2824956672443674, + "GPT-5.4_spec_f1_L3Firm-S": 0.19424460431654678, + "GPT-5.4_spec_f1_L4Quanti": 0.0, + "Opus-4.6_cat_f1": 0.16861118726256397, + "Opus-4.6_spec_f1": 0.19783939283519508, + "Opus-4.6_cat_mcc": 0.2251562222131823, + "Opus-4.6_spec_qwk": 0.22580295138888895, + "Opus-4.6_spec_mae": 0.8925, + "Opus-4.6_cat_ece": 0.19888580093781152, + "Opus-4.6_spec_ece": 0.45814307530721027, + "Opus-4.6_spec_f1_L1Generi": 0.35294117647058826, + "Opus-4.6_spec_f1_L2Domain": 0.251105216622458, + "Opus-4.6_spec_f1_L3Firm-S": 0.18731117824773413, + "Opus-4.6_spec_f1_L4Quanti": 0.0 + } +] \ No newline at end of file diff --git a/results/eval/quant/torchao-int8-dyn/metrics.json b/results/eval/quant/torchao-int8-dyn/metrics.json new file mode 100644 index 0000000..e469295 --- /dev/null +++ b/results/eval/quant/torchao-int8-dyn/metrics.json @@ -0,0 +1,297 @@ +{ + "GPT-5.4": { + "cat_macro_f1": 0.9336475878058536, + "cat_weighted_f1": 0.9342872402134198, + "cat_macro_precision": 0.9319024691959354, + "cat_macro_recall": 0.9376938259865566, + "cat_mcc": 0.9226968780743573, + "cat_auc": 0.9924054453050574, + "cat_ece": 0.05363284418980283, + "cat_confusion_matrix": [ + [ + 225, + 0, + 3, + 0, + 2, + 0, + 0 + ], + [ + 0, + 85, + 0, + 0, + 2, + 1, + 0 + ], + [ + 2, + 0, + 145, + 1, + 2, + 0, + 0 + ], + [ + 1, + 0, + 3, + 130, + 0, + 2, + 0 + ], + [ + 6, + 1, + 5, + 19, + 164, + 1, + 2 + ], + [ + 0, + 3, + 1, + 8, + 2, + 207, + 0 + ], + [ + 0, + 0, + 
0, + 0, + 12, + 0, + 165 + ] + ], + "cat_f1_BoardGov": 0.9698275862068966, + "cat_prec_BoardGov": 0.9615384615384616, + "cat_recall_BoardGov": 0.9782608695652174, + "cat_f1_Incident": 0.96045197740113, + "cat_prec_Incident": 0.9550561797752809, + "cat_recall_Incident": 0.9659090909090909, + "cat_f1_Manageme": 0.9446254071661238, + "cat_prec_Manageme": 0.9235668789808917, + "cat_recall_Manageme": 0.9666666666666667, + "cat_f1_NoneOthe": 0.8843537414965986, + "cat_prec_NoneOthe": 0.8227848101265823, + "cat_recall_NoneOthe": 0.9558823529411765, + "cat_f1_RiskMana": 0.8586387434554974, + "cat_prec_RiskMana": 0.8913043478260869, + "cat_recall_RiskMana": 0.8282828282828283, + "cat_f1_Strategy": 0.9583333333333334, + "cat_prec_Strategy": 0.981042654028436, + "cat_recall_Strategy": 0.9366515837104072, + "cat_f1_Third-Pa": 0.9593023255813954, + "cat_prec_Third-Pa": 0.9880239520958084, + "cat_recall_Third-Pa": 0.9322033898305084, + "cat_kripp_alpha": 0.9223561935890119, + "spec_macro_f1": 0.8918479759675974, + "spec_weighted_f1": 0.9097693388297432, + "spec_macro_precision": 0.8930494570032042, + "spec_macro_recall": 0.8915621000757135, + "spec_mcc": 0.8628946887605918, + "spec_auc": 0.9807842405238503, + "spec_ece": 0.07049367701013878, + "spec_confusion_matrix": [ + [ + 577, + 24, + 14, + 3 + ], + [ + 29, + 129, + 8, + 2 + ], + [ + 9, + 5, + 191, + 2 + ], + [ + 2, + 1, + 9, + 195 + ] + ], + "spec_f1_L1Generi": 0.934412955465587, + "spec_prec_L1Generi": 0.9351701782820098, + "spec_recall_L1Generi": 0.9336569579288025, + "spec_f1_L2Domain": 0.7889908256880734, + "spec_prec_L2Domain": 0.8113207547169812, + "spec_recall_L2Domain": 0.7678571428571429, + "spec_f1_L3Firm-S": 0.8904428904428905, + "spec_prec_L3Firm-S": 0.8603603603603603, + "spec_recall_L3Firm-S": 0.9227053140096618, + "spec_f1_L4Quanti": 0.9535452322738386, + "spec_prec_L4Quanti": 0.9653465346534653, + "spec_recall_L4Quanti": 0.9420289855072463, + "spec_qwk": 0.931514217618119, + "spec_mae": 0.12, + 
"spec_kripp_alpha": 0.9169918680049234 + }, + "Opus-4.6": { + "cat_macro_f1": 0.9242573204255528, + "cat_weighted_f1": 0.9232556488517519, + "cat_macro_precision": 0.9193897229484191, + "cat_macro_recall": 0.9331778058838005, + "cat_mcc": 0.9112549308356716, + "cat_auc": 0.9941614030336741, + "cat_ece": 0.06330573419729862, + "cat_confusion_matrix": [ + [ + 212, + 0, + 1, + 0, + 1, + 0, + 0 + ], + [ + 0, + 78, + 0, + 0, + 1, + 0, + 0 + ], + [ + 8, + 0, + 145, + 1, + 3, + 0, + 1 + ], + [ + 0, + 0, + 1, + 138, + 1, + 1, + 0 + ], + [ + 13, + 0, + 9, + 14, + 169, + 1, + 7 + ], + [ + 1, + 11, + 1, + 4, + 3, + 208, + 0 + ], + [ + 0, + 0, + 0, + 1, + 6, + 1, + 159 + ] + ], + "cat_f1_BoardGov": 0.9464285714285714, + "cat_prec_BoardGov": 0.905982905982906, + "cat_recall_BoardGov": 0.9906542056074766, + "cat_f1_Incident": 0.9285714285714286, + "cat_prec_Incident": 0.8764044943820225, + "cat_recall_Incident": 0.9873417721518988, + "cat_f1_Manageme": 0.9206349206349206, + "cat_prec_Manageme": 0.9235668789808917, + "cat_recall_Manageme": 0.9177215189873418, + "cat_f1_NoneOthe": 0.9230769230769231, + "cat_prec_NoneOthe": 0.8734177215189873, + "cat_recall_NoneOthe": 0.9787234042553191, + "cat_f1_RiskMana": 0.8513853904282116, + "cat_prec_RiskMana": 0.9184782608695652, + "cat_recall_RiskMana": 0.7934272300469484, + "cat_f1_Strategy": 0.9476082004555809, + "cat_prec_Strategy": 0.985781990521327, + "cat_recall_Strategy": 0.9122807017543859, + "cat_f1_Third-Pa": 0.9520958083832335, + "cat_prec_Third-Pa": 0.9520958083832335, + "cat_recall_Third-Pa": 0.9520958083832335, + "cat_kripp_alpha": 0.9105393643352402, + "spec_macro_f1": 0.8827245859621925, + "spec_weighted_f1": 0.8997656600606208, + "spec_macro_precision": 0.8833309642003535, + "spec_macro_recall": 0.8861518760895928, + "spec_mcc": 0.8488976906438819, + "spec_auc": 0.9740582923879771, + "spec_ece": 0.08290670409798626, + "spec_confusion_matrix": [ + [ + 564, + 31, + 9, + 1 + ], + [ + 22, + 118, + 3, + 2 + ], + [ + 31, + 10, + 
209, + 10 + ], + [ + 0, + 0, + 1, + 189 + ] + ], + "spec_f1_L1Generi": 0.9230769230769231, + "spec_prec_L1Generi": 0.9141004862236629, + "spec_recall_L1Generi": 0.9322314049586777, + "spec_f1_L2Domain": 0.7763157894736842, + "spec_prec_L2Domain": 0.7421383647798742, + "spec_recall_L2Domain": 0.8137931034482758, + "spec_f1_L3Firm-S": 0.8672199170124482, + "spec_prec_L3Firm-S": 0.9414414414414415, + "spec_recall_L3Firm-S": 0.8038461538461539, + "spec_f1_L4Quanti": 0.9642857142857143, + "spec_prec_L4Quanti": 0.9356435643564357, + "spec_recall_L4Quanti": 0.9947368421052631, + "spec_qwk": 0.92235918049198, + "spec_mae": 0.13666666666666666, + "spec_kripp_alpha": 0.9061330450504643 + }, + "_runtime": { + "encoder_mb": 789.563648, + "ms_per_sample": 9.671733896636093, + "throughput_per_s": 103.39407707937539, + "peak_vram_mb": 1774.27392578125, + "build_s": 0.4831273259478621 + } +} \ No newline at end of file diff --git a/results/eval/quant/torchao-int8-wo/metrics.json b/results/eval/quant/torchao-int8-wo/metrics.json new file mode 100644 index 0000000..dc68167 --- /dev/null +++ b/results/eval/quant/torchao-int8-wo/metrics.json @@ -0,0 +1,297 @@ +{ + "GPT-5.4": { + "cat_macro_f1": 0.9344870894825886, + "cat_weighted_f1": 0.9351173265780133, + "cat_macro_precision": 0.9326512314038842, + "cat_macro_recall": 0.9387442461546238, + "cat_mcc": 0.9237006314618685, + "cat_auc": 0.992309699625497, + "cat_ece": 0.05415941931307314, + "cat_confusion_matrix": [ + [ + 225, + 0, + 3, + 0, + 2, + 0, + 0 + ], + [ + 0, + 85, + 0, + 0, + 2, + 1, + 0 + ], + [ + 2, + 0, + 145, + 1, + 2, + 0, + 0 + ], + [ + 0, + 0, + 3, + 131, + 0, + 2, + 0 + ], + [ + 6, + 1, + 5, + 19, + 164, + 1, + 2 + ], + [ + 0, + 3, + 1, + 8, + 2, + 207, + 0 + ], + [ + 0, + 0, + 0, + 0, + 12, + 0, + 165 + ] + ], + "cat_f1_BoardGov": 0.9719222462203023, + "cat_prec_BoardGov": 0.9656652360515021, + "cat_recall_BoardGov": 0.9782608695652174, + "cat_f1_Incident": 0.96045197740113, + "cat_prec_Incident": 0.9550561797752809, 
+ "cat_recall_Incident": 0.9659090909090909, + "cat_f1_Manageme": 0.9446254071661238, + "cat_prec_Manageme": 0.9235668789808917, + "cat_recall_Manageme": 0.9666666666666667, + "cat_f1_NoneOthe": 0.888135593220339, + "cat_prec_NoneOthe": 0.8238993710691824, + "cat_recall_NoneOthe": 0.9632352941176471, + "cat_f1_RiskMana": 0.8586387434554974, + "cat_prec_RiskMana": 0.8913043478260869, + "cat_recall_RiskMana": 0.8282828282828283, + "cat_f1_Strategy": 0.9583333333333334, + "cat_prec_Strategy": 0.981042654028436, + "cat_recall_Strategy": 0.9366515837104072, + "cat_f1_Third-Pa": 0.9593023255813954, + "cat_prec_Third-Pa": 0.9880239520958084, + "cat_recall_Third-Pa": 0.9322033898305084, + "cat_kripp_alpha": 0.9233443339647499, + "spec_macro_f1": 0.8941203230194683, + "spec_weighted_f1": 0.9115075208518084, + "spec_macro_precision": 0.8957148694260108, + "spec_macro_recall": 0.892931893103379, + "spec_mcc": 0.8651929532300995, + "spec_auc": 0.981624069084201, + "spec_ece": 0.06980206420024232, + "spec_confusion_matrix": [ + [ + 579, + 24, + 12, + 3 + ], + [ + 29, + 131, + 6, + 2 + ], + [ + 10, + 6, + 189, + 2 + ], + [ + 2, + 1, + 9, + 195 + ] + ], + "spec_f1_L1Generi": 0.9353796445880452, + "spec_prec_L1Generi": 0.9338709677419355, + "spec_recall_L1Generi": 0.9368932038834952, + "spec_f1_L2Domain": 0.793939393939394, + "spec_prec_L2Domain": 0.808641975308642, + "spec_recall_L2Domain": 0.7797619047619048, + "spec_f1_L3Firm-S": 0.8936170212765957, + "spec_prec_L3Firm-S": 0.875, + "spec_recall_L3Firm-S": 0.9130434782608695, + "spec_f1_L4Quanti": 0.9535452322738386, + "spec_prec_L4Quanti": 0.9653465346534653, + "spec_recall_L4Quanti": 0.9420289855072463, + "spec_qwk": 0.9329693660903852, + "spec_mae": 0.1175, + "spec_kripp_alpha": 0.9181842655510584 + }, + "Opus-4.6": { + "cat_macro_f1": 0.9234810481200378, + "cat_weighted_f1": 0.9224737817442137, + "cat_macro_precision": 0.9185473372257941, + "cat_macro_recall": 0.9325102491414775, + "cat_mcc": 0.9102750101817324, + "cat_auc": 
0.9940184741579791, + "cat_ece": 0.0641141641388337, + "cat_confusion_matrix": [ + [ + 211, + 0, + 1, + 1, + 1, + 0, + 0 + ], + [ + 0, + 78, + 0, + 0, + 1, + 0, + 0 + ], + [ + 8, + 0, + 145, + 1, + 3, + 0, + 1 + ], + [ + 0, + 0, + 1, + 138, + 1, + 1, + 0 + ], + [ + 13, + 0, + 9, + 14, + 169, + 1, + 7 + ], + [ + 1, + 11, + 1, + 4, + 3, + 208, + 0 + ], + [ + 0, + 0, + 0, + 1, + 6, + 1, + 159 + ] + ], + "cat_f1_BoardGov": 0.9440715883668904, + "cat_prec_BoardGov": 0.9055793991416309, + "cat_recall_BoardGov": 0.985981308411215, + "cat_f1_Incident": 0.9285714285714286, + "cat_prec_Incident": 0.8764044943820225, + "cat_recall_Incident": 0.9873417721518988, + "cat_f1_Manageme": 0.9206349206349206, + "cat_prec_Manageme": 0.9235668789808917, + "cat_recall_Manageme": 0.9177215189873418, + "cat_f1_NoneOthe": 0.92, + "cat_prec_NoneOthe": 0.8679245283018868, + "cat_recall_NoneOthe": 0.9787234042553191, + "cat_f1_RiskMana": 0.8513853904282116, + "cat_prec_RiskMana": 0.9184782608695652, + "cat_recall_RiskMana": 0.7934272300469484, + "cat_f1_Strategy": 0.9476082004555809, + "cat_prec_Strategy": 0.985781990521327, + "cat_recall_Strategy": 0.9122807017543859, + "cat_f1_Third-Pa": 0.9520958083832335, + "cat_prec_Third-Pa": 0.9520958083832335, + "cat_recall_Third-Pa": 0.9520958083832335, + "cat_kripp_alpha": 0.9095617653952504, + "spec_macro_f1": 0.8814731397444973, + "spec_weighted_f1": 0.8981338362706646, + "spec_macro_precision": 0.8833981471623865, + "spec_macro_recall": 0.8849913986360116, + "spec_mcc": 0.8465512998506631, + "spec_auc": 0.9729999946345258, + "spec_ece": 0.08370273689428968, + "spec_confusion_matrix": [ + [ + 564, + 33, + 7, + 1 + ], + [ + 22, + 119, + 2, + 2 + ], + [ + 34, + 10, + 206, + 10 + ], + [ + 0, + 0, + 1, + 189 + ] + ], + "spec_f1_L1Generi": 0.9208163265306123, + "spec_prec_L1Generi": 0.9096774193548387, + "spec_recall_L1Generi": 0.9322314049586777, + "spec_f1_L2Domain": 0.7752442996742671, + "spec_prec_L2Domain": 0.7345679012345679, + 
"spec_recall_L2Domain": 0.8206896551724138, + "spec_f1_L3Firm-S": 0.865546218487395, + "spec_prec_L3Firm-S": 0.9537037037037037, + "spec_recall_L3Firm-S": 0.7923076923076923, + "spec_f1_L4Quanti": 0.9642857142857143, + "spec_prec_L4Quanti": 0.9356435643564357, + "spec_recall_L4Quanti": 0.9947368421052631, + "spec_qwk": 0.9207708779443254, + "spec_mae": 0.13916666666666666, + "spec_kripp_alpha": 0.9033268512180281 + }, + "_runtime": { + "encoder_mb": 789.563648, + "ms_per_sample": 6.078403938445263, + "throughput_per_s": 164.5168715549004, + "peak_vram_mb": 1416.36376953125, + "build_s": 0.5027359619853087 + } +} \ No newline at end of file