Phase 10.8: torchao/bnb quant sweep on iter1-independent. bf16 already
optimal; torchao int8-wo gives -19% VRAM at no F1 cost; all 4-bit
variants collapse (ModernBERT-large too quant-sensitive).
Phase 10.9: ONNX export + ORT eval. Legacy exporter only working path
(dynamo adds 56 Memcpy nodes); ORT fp32 -22% latency vs torch via
kernel fusion but bf16+flash-attn-2 still wins; fp16 broken on rotary;
dynamic int8 silently CPU-fallback + 0.5 F1 collapse.
Driver scripts wired to bun run py:quant / py:onnx; full reports at
results/eval/{quant,onnx}/REPORT.md.
50 lines
1.9 KiB
JSON
50 lines
1.9 KiB
JSON
[
|
|
{
|
|
"variant": "onnx-fp32",
|
|
"model_mb": 1583.256294,
|
|
"ms_per_sample": 12.703279327494482,
|
|
"throughput_per_s": 78.71983085781946,
|
|
"peak_vram_mb": 8228.0,
|
|
"load_vram_mb": 2074.0,
|
|
"GPT-5.4_cat_f1": 0.9336741161693523,
|
|
"GPT-5.4_spec_f1": 0.8951731906425856,
|
|
"GPT-5.4_cat_mcc": 0.9226990724708704,
|
|
"GPT-5.4_spec_qwk": 0.9324447137231142,
|
|
"GPT-5.4_spec_mae": 0.1175,
|
|
"GPT-5.4_cat_ece": 0.05386760701735813,
|
|
"GPT-5.4_spec_ece": 0.07004604930679002,
|
|
"Opus-4.6_cat_f1": 0.922684387023173,
|
|
"Opus-4.6_spec_f1": 0.8833694419146193,
|
|
"Opus-4.6_cat_mcc": 0.909266938399113,
|
|
"Opus-4.6_spec_qwk": 0.9227008860372746,
|
|
"Opus-4.6_spec_mae": 0.13583333333333333,
|
|
"Opus-4.6_cat_ece": 0.06540583113829297,
|
|
"Opus-4.6_spec_ece": 0.08156729981303217
|
|
},
|
|
{
|
|
"variant": "onnx-fp16",
|
|
"error": "Fail: [ONNXRuntimeError] : 1 : FAIL : Load model from /home/joey/Documents/sec-cyBERT/results/eval/onnx/models/model_fp16.onnx failed:Type Error: Type parameter (T) of Optype (Mul) bound to different types (tensor(float) and tensor(float16) in node (/model/backbone/rotary_emb_1/Mul_2)."
|
|
},
|
|
{
|
|
"variant": "onnx-int8-dyn",
|
|
"model_mb": 553.381903,
|
|
"ms_per_sample": 95.90791940659983,
|
|
"throughput_per_s": 10.42666764316426,
|
|
"peak_vram_mb": 7188.0,
|
|
"load_vram_mb": 266.0,
|
|
"GPT-5.4_cat_f1": 0.3971686880679718,
|
|
"GPT-5.4_spec_f1": 0.3364003775746365,
|
|
"GPT-5.4_cat_mcc": 0.3459776856134484,
|
|
"GPT-5.4_spec_qwk": 0.4412945592628398,
|
|
"GPT-5.4_spec_mae": 0.7225,
|
|
"GPT-5.4_cat_ece": 0.23434762333830195,
|
|
"GPT-5.4_spec_ece": 0.35458642202119034,
|
|
"Opus-4.6_cat_f1": 0.4150626036637055,
|
|
"Opus-4.6_spec_f1": 0.3204015536108683,
|
|
"Opus-4.6_cat_mcc": 0.3663561834842673,
|
|
"Opus-4.6_spec_qwk": 0.43979676755288855,
|
|
"Opus-4.6_spec_mae": 0.7375,
|
|
"Opus-4.6_cat_ece": 0.2126809566716353,
|
|
"Opus-4.6_spec_ece": 0.37541975535452365
|
|
}
|
|
] |