Joey Eamigh 67beaede45
quantization + onnx sweeps
Phase 10.8: torchao/bnb quant sweep on iter1-independent. bf16 is already
optimal; torchao int8-wo gives -19% VRAM at no F1 cost; all bnb 4-bit
variants collapse (ModernBERT-large too quant-sensitive); torchao int4-wo
could not be evaluated (ImportError: Requires mslk >= 1.0.0).

Phase 10.9: ONNX export + ORT eval. The legacy exporter is the only working
path (dynamo adds 56 Memcpy nodes); ORT fp32 is -22% latency vs torch via
kernel fusion, but bf16+flash-attn-2 still wins; fp16 is broken on rotary;
dynamic int8 silently falls back to CPU + 0.5 F1 collapse.

Driver scripts wired to bun run py:quant / py:onnx; full reports at
results/eval/{quant,onnx}/REPORT.md.
2026-04-07 05:10:38 -04:00

286 lines
12 KiB
JSON

[
{
"variant": "fp32",
"description": "Float32 encoder + heads",
"encoder_mb": 1579.127296,
"ms_per_sample": 16.293709366727853,
"throughput_per_s": 61.37337898281309,
"peak_vram_mb": 3503.53369140625,
"build_s": 0.6251941699883901,
"GPT-5.4_cat_f1": 0.9336741161693523,
"GPT-5.4_spec_f1": 0.8943486525770918,
"GPT-5.4_cat_mcc": 0.9226990724708704,
"GPT-5.4_spec_qwk": 0.9321211092744079,
"GPT-5.4_spec_mae": 0.11833333333333333,
"GPT-5.4_cat_ece": 0.05388230005900064,
"GPT-5.4_spec_ece": 0.07088303024570146,
"GPT-5.4_spec_f1_L1Generi": 0.9346246973365617,
"GPT-5.4_spec_f1_L2Domain": 0.7951070336391437,
"GPT-5.4_spec_f1_L3Firm-S": 0.8941176470588236,
"GPT-5.4_spec_f1_L4Quanti": 0.9535452322738386,
"Opus-4.6_cat_f1": 0.922684387023173,
"Opus-4.6_spec_f1": 0.8825095464914274,
"Opus-4.6_cat_mcc": 0.909266938399113,
"Opus-4.6_spec_qwk": 0.9223702541559166,
"Opus-4.6_spec_mae": 0.13666666666666666,
"Opus-4.6_cat_ece": 0.06541596949100496,
"Opus-4.6_spec_ece": 0.08238246644536655,
"Opus-4.6_spec_f1_L1Generi": 0.9233278955954323,
"Opus-4.6_spec_f1_L2Domain": 0.7763157894736842,
"Opus-4.6_spec_f1_L3Firm-S": 0.8661087866108786,
"Opus-4.6_spec_f1_L4Quanti": 0.9642857142857143
},
{
"variant": "bf16",
"description": "BFloat16 baseline (matches eval pipeline)",
"encoder_mb": 789.563648,
"ms_per_sample": 5.516677870764397,
"throughput_per_s": 181.26851402716375,
"peak_vram_mb": 1740.83837890625,
"build_s": 0.48778308398323134,
"GPT-5.4_cat_f1": 0.9336741161693523,
"GPT-5.4_spec_f1": 0.8951731906425856,
"GPT-5.4_cat_mcc": 0.9226990724708704,
"GPT-5.4_spec_qwk": 0.9324447137231142,
"GPT-5.4_spec_mae": 0.1175,
"GPT-5.4_cat_ece": 0.053848127176364245,
"GPT-5.4_spec_ece": 0.07135417198141418,
"GPT-5.4_spec_f1_L1Generi": 0.9354838709677419,
"GPT-5.4_spec_f1_L2Domain": 0.7975460122699386,
"GPT-5.4_spec_f1_L3Firm-S": 0.8941176470588236,
"GPT-5.4_spec_f1_L4Quanti": 0.9535452322738386,
"Opus-4.6_cat_f1": 0.922684387023173,
"Opus-4.6_spec_f1": 0.8833694419146193,
"Opus-4.6_cat_mcc": 0.909266938399113,
"Opus-4.6_spec_qwk": 0.9227008860372746,
"Opus-4.6_spec_mae": 0.13583333333333333,
"Opus-4.6_cat_ece": 0.06551479384303091,
"Opus-4.6_spec_ece": 0.08248284702499709,
"Opus-4.6_spec_f1_L1Generi": 0.9242053789731052,
"Opus-4.6_spec_f1_L2Domain": 0.7788778877887789,
"Opus-4.6_spec_f1_L3Firm-S": 0.8661087866108786,
"Opus-4.6_spec_f1_L4Quanti": 0.9642857142857143
},
{
"variant": "fp16",
"description": "Float16 encoder + heads",
"encoder_mb": 789.563648,
"ms_per_sample": 5.539002780715236,
"throughput_per_s": 180.53791261517884,
"peak_vram_mb": 1740.83837890625,
"build_s": 0.46582157304510474,
"GPT-5.4_cat_f1": 0.9336741161693523,
"GPT-5.4_spec_f1": 0.8951731906425856,
"GPT-5.4_cat_mcc": 0.9226990724708704,
"GPT-5.4_spec_qwk": 0.9324447137231142,
"GPT-5.4_spec_mae": 0.1175,
"GPT-5.4_cat_ece": 0.053747650533914546,
"GPT-5.4_spec_ece": 0.07004868157207966,
"GPT-5.4_spec_f1_L1Generi": 0.9354838709677419,
"GPT-5.4_spec_f1_L2Domain": 0.7975460122699386,
"GPT-5.4_spec_f1_L3Firm-S": 0.8941176470588236,
"GPT-5.4_spec_f1_L4Quanti": 0.9535452322738386,
"Opus-4.6_cat_f1": 0.922684387023173,
"Opus-4.6_spec_f1": 0.8833694419146193,
"Opus-4.6_cat_mcc": 0.909266938399113,
"Opus-4.6_spec_qwk": 0.9227008860372746,
"Opus-4.6_spec_mae": 0.13583333333333333,
"Opus-4.6_cat_ece": 0.06541431720058125,
"Opus-4.6_spec_ece": 0.0816012116521597,
"Opus-4.6_spec_f1_L1Generi": 0.9242053789731052,
"Opus-4.6_spec_f1_L2Domain": 0.7788778877887789,
"Opus-4.6_spec_f1_L3Firm-S": 0.8661087866108786,
"Opus-4.6_spec_f1_L4Quanti": 0.9642857142857143
},
{
"variant": "torchao-int8-wo",
"description": "torchao Int8 weight-only on encoder linears",
"encoder_mb": 789.563648,
"ms_per_sample": 6.078403938445263,
"throughput_per_s": 164.5168715549004,
"peak_vram_mb": 1416.36376953125,
"build_s": 0.5027359619853087,
"GPT-5.4_cat_f1": 0.9344870894825886,
"GPT-5.4_spec_f1": 0.8941203230194683,
"GPT-5.4_cat_mcc": 0.9237006314618685,
"GPT-5.4_spec_qwk": 0.9329693660903852,
"GPT-5.4_spec_mae": 0.1175,
"GPT-5.4_cat_ece": 0.05415941931307314,
"GPT-5.4_spec_ece": 0.06980206420024232,
"GPT-5.4_spec_f1_L1Generi": 0.9353796445880452,
"GPT-5.4_spec_f1_L2Domain": 0.793939393939394,
"GPT-5.4_spec_f1_L3Firm-S": 0.8936170212765957,
"GPT-5.4_spec_f1_L4Quanti": 0.9535452322738386,
"Opus-4.6_cat_f1": 0.9234810481200378,
"Opus-4.6_spec_f1": 0.8814731397444973,
"Opus-4.6_cat_mcc": 0.9102750101817324,
"Opus-4.6_spec_qwk": 0.9207708779443254,
"Opus-4.6_spec_mae": 0.13916666666666666,
"Opus-4.6_cat_ece": 0.0641141641388337,
"Opus-4.6_spec_ece": 0.08370273689428968,
"Opus-4.6_spec_f1_L1Generi": 0.9208163265306123,
"Opus-4.6_spec_f1_L2Domain": 0.7752442996742671,
"Opus-4.6_spec_f1_L3Firm-S": 0.865546218487395,
"Opus-4.6_spec_f1_L4Quanti": 0.9642857142857143
},
{
"variant": "torchao-int8-dyn",
"description": "torchao Int8 dynamic activation + Int8 weight on encoder",
"encoder_mb": 789.563648,
"ms_per_sample": 9.671733896636093,
"throughput_per_s": 103.39407707937539,
"peak_vram_mb": 1774.27392578125,
"build_s": 0.4831273259478621,
"GPT-5.4_cat_f1": 0.9336475878058536,
"GPT-5.4_spec_f1": 0.8918479759675974,
"GPT-5.4_cat_mcc": 0.9226968780743573,
"GPT-5.4_spec_qwk": 0.931514217618119,
"GPT-5.4_spec_mae": 0.12,
"GPT-5.4_cat_ece": 0.05363284418980283,
"GPT-5.4_spec_ece": 0.07049367701013878,
"GPT-5.4_spec_f1_L1Generi": 0.934412955465587,
"GPT-5.4_spec_f1_L2Domain": 0.7889908256880734,
"GPT-5.4_spec_f1_L3Firm-S": 0.8904428904428905,
"GPT-5.4_spec_f1_L4Quanti": 0.9535452322738386,
"Opus-4.6_cat_f1": 0.9242573204255528,
"Opus-4.6_spec_f1": 0.8827245859621925,
"Opus-4.6_cat_mcc": 0.9112549308356716,
"Opus-4.6_spec_qwk": 0.92235918049198,
"Opus-4.6_spec_mae": 0.13666666666666666,
"Opus-4.6_cat_ece": 0.06330573419729862,
"Opus-4.6_spec_ece": 0.08290670409798626,
"Opus-4.6_spec_f1_L1Generi": 0.9230769230769231,
"Opus-4.6_spec_f1_L2Domain": 0.7763157894736842,
"Opus-4.6_spec_f1_L3Firm-S": 0.8672199170124482,
"Opus-4.6_spec_f1_L4Quanti": 0.9642857142857143
},
{
"variant": "torchao-int4-wo",
"description": "torchao Int4 weight-only (group=128) on encoder",
"error": "ImportError: Requires mslk >= 1.0.0"
},
{
"variant": "bnb-int8",
"description": "bitsandbytes LLM.int8 on encoder linears",
"encoder_mb": 789.563648,
"ms_per_sample": 7.762363941583317,
"throughput_per_s": 128.82673468103667,
"peak_vram_mb": 2135.203125,
"build_s": 1.1878160500200465,
"GPT-5.4_cat_f1": 0.9360988760303737,
"GPT-5.4_spec_f1": 0.8986323186392307,
"GPT-5.4_cat_mcc": 0.9256911778959798,
"GPT-5.4_spec_qwk": 0.9307948020550015,
"GPT-5.4_spec_mae": 0.1175,
"GPT-5.4_cat_ece": 0.052939765204985965,
"GPT-5.4_spec_ece": 0.06740866973996164,
"GPT-5.4_spec_f1_L1Generi": 0.9364440868865648,
"GPT-5.4_spec_f1_L2Domain": 0.8099688473520249,
"GPT-5.4_spec_f1_L3Firm-S": 0.892018779342723,
"GPT-5.4_spec_f1_L4Quanti": 0.9560975609756097,
"Opus-4.6_cat_f1": 0.9235105849558979,
"Opus-4.6_spec_f1": 0.8826923642825633,
"Opus-4.6_cat_mcc": 0.9103198007176273,
"Opus-4.6_spec_qwk": 0.9198415117342273,
"Opus-4.6_spec_mae": 0.13916666666666666,
"Opus-4.6_cat_ece": 0.06465620135267579,
"Opus-4.6_spec_ece": 0.083350846717755,
"Opus-4.6_spec_f1_L1Generi": 0.9235772357723577,
"Opus-4.6_spec_f1_L2Domain": 0.785234899328859,
"Opus-4.6_spec_f1_L3Firm-S": 0.860125260960334,
"Opus-4.6_spec_f1_L4Quanti": 0.9618320610687023
},
{
"variant": "bnb-nf4",
"description": "bitsandbytes NF4 4-bit (double-quant, bf16 compute)",
"encoder_mb": 274.843904,
"ms_per_sample": 5.860076693982895,
"throughput_per_s": 170.64623079537446,
"peak_vram_mb": 1287.34326171875,
"build_s": 0.4858604749897495,
"GPT-5.4_cat_f1": 0.3536909012886116,
"GPT-5.4_spec_f1": 0.22049451330952025,
"GPT-5.4_cat_mcc": 0.42471542150657926,
"GPT-5.4_spec_qwk": 0.24233251808742773,
"GPT-5.4_spec_mae": 0.8733333333333333,
"GPT-5.4_cat_ece": 0.09734637491405013,
"GPT-5.4_spec_ece": 0.43363295723994577,
"GPT-5.4_spec_f1_L1Generi": 0.35013262599469497,
"GPT-5.4_spec_f1_L2Domain": 0.28546861564918313,
"GPT-5.4_spec_f1_L3Firm-S": 0.2463768115942029,
"GPT-5.4_spec_f1_L4Quanti": 0.0,
"Opus-4.6_cat_f1": 0.35763512449392704,
"Opus-4.6_spec_f1": 0.20754679251319788,
"Opus-4.6_cat_mcc": 0.42166882753874363,
"Opus-4.6_spec_qwk": 0.24096533359991634,
"Opus-4.6_spec_mae": 0.88,
"Opus-4.6_cat_ece": 0.09567970824738346,
"Opus-4.6_spec_ece": 0.4489923599362374,
"Opus-4.6_spec_f1_L1Generi": 0.3508771929824561,
"Opus-4.6_spec_f1_L2Domain": 0.2543859649122807,
"Opus-4.6_spec_f1_L3Firm-S": 0.22492401215805471,
"Opus-4.6_spec_f1_L4Quanti": 0.0
},
{
"variant": "bnb-nf4-nodq",
"description": "bitsandbytes NF4 4-bit (no double-quant)",
"encoder_mb": 274.843904,
"ms_per_sample": 5.861402786540566,
"throughput_per_s": 170.607623536175,
"peak_vram_mb": 1287.34326171875,
"build_s": 0.4908116469741799,
"GPT-5.4_cat_f1": 0.3536909012886116,
"GPT-5.4_spec_f1": 0.22049451330952025,
"GPT-5.4_cat_mcc": 0.42471542150657926,
"GPT-5.4_spec_qwk": 0.24233251808742773,
"GPT-5.4_spec_mae": 0.8733333333333333,
"GPT-5.4_cat_ece": 0.09734637491405013,
"GPT-5.4_spec_ece": 0.43363295723994577,
"GPT-5.4_spec_f1_L1Generi": 0.35013262599469497,
"GPT-5.4_spec_f1_L2Domain": 0.28546861564918313,
"GPT-5.4_spec_f1_L3Firm-S": 0.2463768115942029,
"GPT-5.4_spec_f1_L4Quanti": 0.0,
"Opus-4.6_cat_f1": 0.35763512449392704,
"Opus-4.6_spec_f1": 0.20754679251319788,
"Opus-4.6_cat_mcc": 0.42166882753874363,
"Opus-4.6_spec_qwk": 0.24096533359991634,
"Opus-4.6_spec_mae": 0.88,
"Opus-4.6_cat_ece": 0.09567970824738346,
"Opus-4.6_spec_ece": 0.4489923599362374,
"Opus-4.6_spec_f1_L1Generi": 0.3508771929824561,
"Opus-4.6_spec_f1_L2Domain": 0.2543859649122807,
"Opus-4.6_spec_f1_L3Firm-S": 0.22492401215805471,
"Opus-4.6_spec_f1_L4Quanti": 0.0
},
{
"variant": "bnb-fp4",
"description": "bitsandbytes FP4 4-bit (no double-quant)",
"encoder_mb": 274.843904,
"ms_per_sample": 5.865302347471394,
"throughput_per_s": 170.49419463109393,
"peak_vram_mb": 1287.34326171875,
"build_s": 0.4887635139748454,
"GPT-5.4_cat_f1": 0.16293893512410998,
"GPT-5.4_spec_f1": 0.20854117827130608,
"GPT-5.4_cat_mcc": 0.22345796853389935,
"GPT-5.4_spec_qwk": 0.2326064604575444,
"GPT-5.4_spec_mae": 0.8825,
"GPT-5.4_cat_ece": 0.2080524676044782,
"GPT-5.4_spec_ece": 0.44289420386155437,
"GPT-5.4_spec_f1_L1Generi": 0.35742444152431013,
"GPT-5.4_spec_f1_L2Domain": 0.2824956672443674,
"GPT-5.4_spec_f1_L3Firm-S": 0.19424460431654678,
"GPT-5.4_spec_f1_L4Quanti": 0.0,
"Opus-4.6_cat_f1": 0.16861118726256397,
"Opus-4.6_spec_f1": 0.19783939283519508,
"Opus-4.6_cat_mcc": 0.2251562222131823,
"Opus-4.6_spec_qwk": 0.22580295138888895,
"Opus-4.6_spec_mae": 0.8925,
"Opus-4.6_cat_ece": 0.19888580093781152,
"Opus-4.6_spec_ece": 0.45814307530721027,
"Opus-4.6_spec_f1_L1Generi": 0.35294117647058826,
"Opus-4.6_spec_f1_L2Domain": 0.251105216622458,
"Opus-4.6_spec_f1_L3Firm-S": 0.18731117824773413,
"Opus-4.6_spec_f1_L4Quanti": 0.0
}
]