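Phase 10.8: torchao/bnb quant sweep on iter1-independent. bf16 is already
optimal; torchao int8-wo gives 19% lower peak VRAM than bf16 at no F1 cost;
all bnb 4-bit variants (NF4 with and without double-quant, FP4) collapse
(ModernBERT-large too quant-sensitive); torchao int4-wo could not be run
(ImportError: Requires mslk >= 1.0.0).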
Phase 10.9: ONNX export + ORT eval. The legacy exporter is the only working
path (dynamo adds 56 Memcpy nodes); ORT fp32 has 22% lower latency than torch
via kernel fusion, but bf16+flash-attn-2 still wins; fp16 is broken on rotary;
dynamic int8 silently falls back to CPU, with a 0.5 F1 collapse.
Driver scripts wired to bun run py:quant / py:onnx; full reports at
results/eval/{quant,onnx}/REPORT.md.
286 lines
12 KiB
JSON
286 lines
12 KiB
JSON
[
|
|
{
|
|
"variant": "fp32",
|
|
"description": "Float32 encoder + heads",
|
|
"encoder_mb": 1579.127296,
|
|
"ms_per_sample": 16.293709366727853,
|
|
"throughput_per_s": 61.37337898281309,
|
|
"peak_vram_mb": 3503.53369140625,
|
|
"build_s": 0.6251941699883901,
|
|
"GPT-5.4_cat_f1": 0.9336741161693523,
|
|
"GPT-5.4_spec_f1": 0.8943486525770918,
|
|
"GPT-5.4_cat_mcc": 0.9226990724708704,
|
|
"GPT-5.4_spec_qwk": 0.9321211092744079,
|
|
"GPT-5.4_spec_mae": 0.11833333333333333,
|
|
"GPT-5.4_cat_ece": 0.05388230005900064,
|
|
"GPT-5.4_spec_ece": 0.07088303024570146,
|
|
"GPT-5.4_spec_f1_L1Generi": 0.9346246973365617,
|
|
"GPT-5.4_spec_f1_L2Domain": 0.7951070336391437,
|
|
"GPT-5.4_spec_f1_L3Firm-S": 0.8941176470588236,
|
|
"GPT-5.4_spec_f1_L4Quanti": 0.9535452322738386,
|
|
"Opus-4.6_cat_f1": 0.922684387023173,
|
|
"Opus-4.6_spec_f1": 0.8825095464914274,
|
|
"Opus-4.6_cat_mcc": 0.909266938399113,
|
|
"Opus-4.6_spec_qwk": 0.9223702541559166,
|
|
"Opus-4.6_spec_mae": 0.13666666666666666,
|
|
"Opus-4.6_cat_ece": 0.06541596949100496,
|
|
"Opus-4.6_spec_ece": 0.08238246644536655,
|
|
"Opus-4.6_spec_f1_L1Generi": 0.9233278955954323,
|
|
"Opus-4.6_spec_f1_L2Domain": 0.7763157894736842,
|
|
"Opus-4.6_spec_f1_L3Firm-S": 0.8661087866108786,
|
|
"Opus-4.6_spec_f1_L4Quanti": 0.9642857142857143
|
|
},
|
|
{
|
|
"variant": "bf16",
|
|
"description": "BFloat16 baseline (matches eval pipeline)",
|
|
"encoder_mb": 789.563648,
|
|
"ms_per_sample": 5.516677870764397,
|
|
"throughput_per_s": 181.26851402716375,
|
|
"peak_vram_mb": 1740.83837890625,
|
|
"build_s": 0.48778308398323134,
|
|
"GPT-5.4_cat_f1": 0.9336741161693523,
|
|
"GPT-5.4_spec_f1": 0.8951731906425856,
|
|
"GPT-5.4_cat_mcc": 0.9226990724708704,
|
|
"GPT-5.4_spec_qwk": 0.9324447137231142,
|
|
"GPT-5.4_spec_mae": 0.1175,
|
|
"GPT-5.4_cat_ece": 0.053848127176364245,
|
|
"GPT-5.4_spec_ece": 0.07135417198141418,
|
|
"GPT-5.4_spec_f1_L1Generi": 0.9354838709677419,
|
|
"GPT-5.4_spec_f1_L2Domain": 0.7975460122699386,
|
|
"GPT-5.4_spec_f1_L3Firm-S": 0.8941176470588236,
|
|
"GPT-5.4_spec_f1_L4Quanti": 0.9535452322738386,
|
|
"Opus-4.6_cat_f1": 0.922684387023173,
|
|
"Opus-4.6_spec_f1": 0.8833694419146193,
|
|
"Opus-4.6_cat_mcc": 0.909266938399113,
|
|
"Opus-4.6_spec_qwk": 0.9227008860372746,
|
|
"Opus-4.6_spec_mae": 0.13583333333333333,
|
|
"Opus-4.6_cat_ece": 0.06551479384303091,
|
|
"Opus-4.6_spec_ece": 0.08248284702499709,
|
|
"Opus-4.6_spec_f1_L1Generi": 0.9242053789731052,
|
|
"Opus-4.6_spec_f1_L2Domain": 0.7788778877887789,
|
|
"Opus-4.6_spec_f1_L3Firm-S": 0.8661087866108786,
|
|
"Opus-4.6_spec_f1_L4Quanti": 0.9642857142857143
|
|
},
|
|
{
|
|
"variant": "fp16",
|
|
"description": "Float16 encoder + heads",
|
|
"encoder_mb": 789.563648,
|
|
"ms_per_sample": 5.539002780715236,
|
|
"throughput_per_s": 180.53791261517884,
|
|
"peak_vram_mb": 1740.83837890625,
|
|
"build_s": 0.46582157304510474,
|
|
"GPT-5.4_cat_f1": 0.9336741161693523,
|
|
"GPT-5.4_spec_f1": 0.8951731906425856,
|
|
"GPT-5.4_cat_mcc": 0.9226990724708704,
|
|
"GPT-5.4_spec_qwk": 0.9324447137231142,
|
|
"GPT-5.4_spec_mae": 0.1175,
|
|
"GPT-5.4_cat_ece": 0.053747650533914546,
|
|
"GPT-5.4_spec_ece": 0.07004868157207966,
|
|
"GPT-5.4_spec_f1_L1Generi": 0.9354838709677419,
|
|
"GPT-5.4_spec_f1_L2Domain": 0.7975460122699386,
|
|
"GPT-5.4_spec_f1_L3Firm-S": 0.8941176470588236,
|
|
"GPT-5.4_spec_f1_L4Quanti": 0.9535452322738386,
|
|
"Opus-4.6_cat_f1": 0.922684387023173,
|
|
"Opus-4.6_spec_f1": 0.8833694419146193,
|
|
"Opus-4.6_cat_mcc": 0.909266938399113,
|
|
"Opus-4.6_spec_qwk": 0.9227008860372746,
|
|
"Opus-4.6_spec_mae": 0.13583333333333333,
|
|
"Opus-4.6_cat_ece": 0.06541431720058125,
|
|
"Opus-4.6_spec_ece": 0.0816012116521597,
|
|
"Opus-4.6_spec_f1_L1Generi": 0.9242053789731052,
|
|
"Opus-4.6_spec_f1_L2Domain": 0.7788778877887789,
|
|
"Opus-4.6_spec_f1_L3Firm-S": 0.8661087866108786,
|
|
"Opus-4.6_spec_f1_L4Quanti": 0.9642857142857143
|
|
},
|
|
{
|
|
"variant": "torchao-int8-wo",
|
|
"description": "torchao Int8 weight-only on encoder linears",
|
|
"encoder_mb": 789.563648,
|
|
"ms_per_sample": 6.078403938445263,
|
|
"throughput_per_s": 164.5168715549004,
|
|
"peak_vram_mb": 1416.36376953125,
|
|
"build_s": 0.5027359619853087,
|
|
"GPT-5.4_cat_f1": 0.9344870894825886,
|
|
"GPT-5.4_spec_f1": 0.8941203230194683,
|
|
"GPT-5.4_cat_mcc": 0.9237006314618685,
|
|
"GPT-5.4_spec_qwk": 0.9329693660903852,
|
|
"GPT-5.4_spec_mae": 0.1175,
|
|
"GPT-5.4_cat_ece": 0.05415941931307314,
|
|
"GPT-5.4_spec_ece": 0.06980206420024232,
|
|
"GPT-5.4_spec_f1_L1Generi": 0.9353796445880452,
|
|
"GPT-5.4_spec_f1_L2Domain": 0.793939393939394,
|
|
"GPT-5.4_spec_f1_L3Firm-S": 0.8936170212765957,
|
|
"GPT-5.4_spec_f1_L4Quanti": 0.9535452322738386,
|
|
"Opus-4.6_cat_f1": 0.9234810481200378,
|
|
"Opus-4.6_spec_f1": 0.8814731397444973,
|
|
"Opus-4.6_cat_mcc": 0.9102750101817324,
|
|
"Opus-4.6_spec_qwk": 0.9207708779443254,
|
|
"Opus-4.6_spec_mae": 0.13916666666666666,
|
|
"Opus-4.6_cat_ece": 0.0641141641388337,
|
|
"Opus-4.6_spec_ece": 0.08370273689428968,
|
|
"Opus-4.6_spec_f1_L1Generi": 0.9208163265306123,
|
|
"Opus-4.6_spec_f1_L2Domain": 0.7752442996742671,
|
|
"Opus-4.6_spec_f1_L3Firm-S": 0.865546218487395,
|
|
"Opus-4.6_spec_f1_L4Quanti": 0.9642857142857143
|
|
},
|
|
{
|
|
"variant": "torchao-int8-dyn",
|
|
"description": "torchao Int8 dynamic activation + Int8 weight on encoder",
|
|
"encoder_mb": 789.563648,
|
|
"ms_per_sample": 9.671733896636093,
|
|
"throughput_per_s": 103.39407707937539,
|
|
"peak_vram_mb": 1774.27392578125,
|
|
"build_s": 0.4831273259478621,
|
|
"GPT-5.4_cat_f1": 0.9336475878058536,
|
|
"GPT-5.4_spec_f1": 0.8918479759675974,
|
|
"GPT-5.4_cat_mcc": 0.9226968780743573,
|
|
"GPT-5.4_spec_qwk": 0.931514217618119,
|
|
"GPT-5.4_spec_mae": 0.12,
|
|
"GPT-5.4_cat_ece": 0.05363284418980283,
|
|
"GPT-5.4_spec_ece": 0.07049367701013878,
|
|
"GPT-5.4_spec_f1_L1Generi": 0.934412955465587,
|
|
"GPT-5.4_spec_f1_L2Domain": 0.7889908256880734,
|
|
"GPT-5.4_spec_f1_L3Firm-S": 0.8904428904428905,
|
|
"GPT-5.4_spec_f1_L4Quanti": 0.9535452322738386,
|
|
"Opus-4.6_cat_f1": 0.9242573204255528,
|
|
"Opus-4.6_spec_f1": 0.8827245859621925,
|
|
"Opus-4.6_cat_mcc": 0.9112549308356716,
|
|
"Opus-4.6_spec_qwk": 0.92235918049198,
|
|
"Opus-4.6_spec_mae": 0.13666666666666666,
|
|
"Opus-4.6_cat_ece": 0.06330573419729862,
|
|
"Opus-4.6_spec_ece": 0.08290670409798626,
|
|
"Opus-4.6_spec_f1_L1Generi": 0.9230769230769231,
|
|
"Opus-4.6_spec_f1_L2Domain": 0.7763157894736842,
|
|
"Opus-4.6_spec_f1_L3Firm-S": 0.8672199170124482,
|
|
"Opus-4.6_spec_f1_L4Quanti": 0.9642857142857143
|
|
},
|
|
{
|
|
"variant": "torchao-int4-wo",
|
|
"description": "torchao Int4 weight-only (group=128) on encoder",
|
|
"error": "ImportError: Requires mslk >= 1.0.0"
|
|
},
|
|
{
|
|
"variant": "bnb-int8",
|
|
"description": "bitsandbytes LLM.int8 on encoder linears",
|
|
"encoder_mb": 789.563648,
|
|
"ms_per_sample": 7.762363941583317,
|
|
"throughput_per_s": 128.82673468103667,
|
|
"peak_vram_mb": 2135.203125,
|
|
"build_s": 1.1878160500200465,
|
|
"GPT-5.4_cat_f1": 0.9360988760303737,
|
|
"GPT-5.4_spec_f1": 0.8986323186392307,
|
|
"GPT-5.4_cat_mcc": 0.9256911778959798,
|
|
"GPT-5.4_spec_qwk": 0.9307948020550015,
|
|
"GPT-5.4_spec_mae": 0.1175,
|
|
"GPT-5.4_cat_ece": 0.052939765204985965,
|
|
"GPT-5.4_spec_ece": 0.06740866973996164,
|
|
"GPT-5.4_spec_f1_L1Generi": 0.9364440868865648,
|
|
"GPT-5.4_spec_f1_L2Domain": 0.8099688473520249,
|
|
"GPT-5.4_spec_f1_L3Firm-S": 0.892018779342723,
|
|
"GPT-5.4_spec_f1_L4Quanti": 0.9560975609756097,
|
|
"Opus-4.6_cat_f1": 0.9235105849558979,
|
|
"Opus-4.6_spec_f1": 0.8826923642825633,
|
|
"Opus-4.6_cat_mcc": 0.9103198007176273,
|
|
"Opus-4.6_spec_qwk": 0.9198415117342273,
|
|
"Opus-4.6_spec_mae": 0.13916666666666666,
|
|
"Opus-4.6_cat_ece": 0.06465620135267579,
|
|
"Opus-4.6_spec_ece": 0.083350846717755,
|
|
"Opus-4.6_spec_f1_L1Generi": 0.9235772357723577,
|
|
"Opus-4.6_spec_f1_L2Domain": 0.785234899328859,
|
|
"Opus-4.6_spec_f1_L3Firm-S": 0.860125260960334,
|
|
"Opus-4.6_spec_f1_L4Quanti": 0.9618320610687023
|
|
},
|
|
{
|
|
"variant": "bnb-nf4",
|
|
"description": "bitsandbytes NF4 4-bit (double-quant, bf16 compute)",
|
|
"encoder_mb": 274.843904,
|
|
"ms_per_sample": 5.860076693982895,
|
|
"throughput_per_s": 170.64623079537446,
|
|
"peak_vram_mb": 1287.34326171875,
|
|
"build_s": 0.4858604749897495,
|
|
"GPT-5.4_cat_f1": 0.3536909012886116,
|
|
"GPT-5.4_spec_f1": 0.22049451330952025,
|
|
"GPT-5.4_cat_mcc": 0.42471542150657926,
|
|
"GPT-5.4_spec_qwk": 0.24233251808742773,
|
|
"GPT-5.4_spec_mae": 0.8733333333333333,
|
|
"GPT-5.4_cat_ece": 0.09734637491405013,
|
|
"GPT-5.4_spec_ece": 0.43363295723994577,
|
|
"GPT-5.4_spec_f1_L1Generi": 0.35013262599469497,
|
|
"GPT-5.4_spec_f1_L2Domain": 0.28546861564918313,
|
|
"GPT-5.4_spec_f1_L3Firm-S": 0.2463768115942029,
|
|
"GPT-5.4_spec_f1_L4Quanti": 0.0,
|
|
"Opus-4.6_cat_f1": 0.35763512449392704,
|
|
"Opus-4.6_spec_f1": 0.20754679251319788,
|
|
"Opus-4.6_cat_mcc": 0.42166882753874363,
|
|
"Opus-4.6_spec_qwk": 0.24096533359991634,
|
|
"Opus-4.6_spec_mae": 0.88,
|
|
"Opus-4.6_cat_ece": 0.09567970824738346,
|
|
"Opus-4.6_spec_ece": 0.4489923599362374,
|
|
"Opus-4.6_spec_f1_L1Generi": 0.3508771929824561,
|
|
"Opus-4.6_spec_f1_L2Domain": 0.2543859649122807,
|
|
"Opus-4.6_spec_f1_L3Firm-S": 0.22492401215805471,
|
|
"Opus-4.6_spec_f1_L4Quanti": 0.0
|
|
},
|
|
{
|
|
"variant": "bnb-nf4-nodq",
|
|
"description": "bitsandbytes NF4 4-bit (no double-quant)",
|
|
"encoder_mb": 274.843904,
|
|
"ms_per_sample": 5.861402786540566,
|
|
"throughput_per_s": 170.607623536175,
|
|
"peak_vram_mb": 1287.34326171875,
|
|
"build_s": 0.4908116469741799,
|
|
"GPT-5.4_cat_f1": 0.3536909012886116,
|
|
"GPT-5.4_spec_f1": 0.22049451330952025,
|
|
"GPT-5.4_cat_mcc": 0.42471542150657926,
|
|
"GPT-5.4_spec_qwk": 0.24233251808742773,
|
|
"GPT-5.4_spec_mae": 0.8733333333333333,
|
|
"GPT-5.4_cat_ece": 0.09734637491405013,
|
|
"GPT-5.4_spec_ece": 0.43363295723994577,
|
|
"GPT-5.4_spec_f1_L1Generi": 0.35013262599469497,
|
|
"GPT-5.4_spec_f1_L2Domain": 0.28546861564918313,
|
|
"GPT-5.4_spec_f1_L3Firm-S": 0.2463768115942029,
|
|
"GPT-5.4_spec_f1_L4Quanti": 0.0,
|
|
"Opus-4.6_cat_f1": 0.35763512449392704,
|
|
"Opus-4.6_spec_f1": 0.20754679251319788,
|
|
"Opus-4.6_cat_mcc": 0.42166882753874363,
|
|
"Opus-4.6_spec_qwk": 0.24096533359991634,
|
|
"Opus-4.6_spec_mae": 0.88,
|
|
"Opus-4.6_cat_ece": 0.09567970824738346,
|
|
"Opus-4.6_spec_ece": 0.4489923599362374,
|
|
"Opus-4.6_spec_f1_L1Generi": 0.3508771929824561,
|
|
"Opus-4.6_spec_f1_L2Domain": 0.2543859649122807,
|
|
"Opus-4.6_spec_f1_L3Firm-S": 0.22492401215805471,
|
|
"Opus-4.6_spec_f1_L4Quanti": 0.0
|
|
},
|
|
{
|
|
"variant": "bnb-fp4",
|
|
"description": "bitsandbytes FP4 4-bit (no double-quant)",
|
|
"encoder_mb": 274.843904,
|
|
"ms_per_sample": 5.865302347471394,
|
|
"throughput_per_s": 170.49419463109393,
|
|
"peak_vram_mb": 1287.34326171875,
|
|
"build_s": 0.4887635139748454,
|
|
"GPT-5.4_cat_f1": 0.16293893512410998,
|
|
"GPT-5.4_spec_f1": 0.20854117827130608,
|
|
"GPT-5.4_cat_mcc": 0.22345796853389935,
|
|
"GPT-5.4_spec_qwk": 0.2326064604575444,
|
|
"GPT-5.4_spec_mae": 0.8825,
|
|
"GPT-5.4_cat_ece": 0.2080524676044782,
|
|
"GPT-5.4_spec_ece": 0.44289420386155437,
|
|
"GPT-5.4_spec_f1_L1Generi": 0.35742444152431013,
|
|
"GPT-5.4_spec_f1_L2Domain": 0.2824956672443674,
|
|
"GPT-5.4_spec_f1_L3Firm-S": 0.19424460431654678,
|
|
"GPT-5.4_spec_f1_L4Quanti": 0.0,
|
|
"Opus-4.6_cat_f1": 0.16861118726256397,
|
|
"Opus-4.6_spec_f1": 0.19783939283519508,
|
|
"Opus-4.6_cat_mcc": 0.2251562222131823,
|
|
"Opus-4.6_spec_qwk": 0.22580295138888895,
|
|
"Opus-4.6_spec_mae": 0.8925,
|
|
"Opus-4.6_cat_ece": 0.19888580093781152,
|
|
"Opus-4.6_spec_ece": 0.45814307530721027,
|
|
"Opus-4.6_spec_f1_L1Generi": 0.35294117647058826,
|
|
"Opus-4.6_spec_f1_L2Domain": 0.251105216622458,
|
|
"Opus-4.6_spec_f1_L3Firm-S": 0.18731117824773413,
|
|
"Opus-4.6_spec_f1_L4Quanti": 0.0
|
|
}
|
|
] |