Joey Eamigh 67beaede45
quantization + onnx sweeps
Phase 10.8: torchao/bnb quant sweep on iter1-independent. bf16 already
optimal; torchao int8-wo gives -19% VRAM at no F1 cost; all 4-bit
variants collapse (ModernBERT-large too quant-sensitive).

Phase 10.9: ONNX export + ORT eval. Legacy exporter only working path
(dynamo adds 56 Memcpy nodes); ORT fp32 -22% latency vs torch via
kernel fusion but bf16+flash-attn-2 still wins; fp16 broken on rotary;
dynamic int8 silently CPU-fallback + 0.5 F1 collapse.

Driver scripts wired to bun run py:quant / py:onnx; full reports at
results/eval/{quant,onnx}/REPORT.md.
2026-04-07 05:10:38 -04:00

297 lines
6.5 KiB
JSON

{
"GPT-5.4": {
"cat_macro_f1": 0.9336475878058536,
"cat_weighted_f1": 0.9342872402134198,
"cat_macro_precision": 0.9319024691959354,
"cat_macro_recall": 0.9376938259865566,
"cat_mcc": 0.9226968780743573,
"cat_auc": 0.9924054453050574,
"cat_ece": 0.05363284418980283,
"cat_confusion_matrix": [
[
225,
0,
3,
0,
2,
0,
0
],
[
0,
85,
0,
0,
2,
1,
0
],
[
2,
0,
145,
1,
2,
0,
0
],
[
1,
0,
3,
130,
0,
2,
0
],
[
6,
1,
5,
19,
164,
1,
2
],
[
0,
3,
1,
8,
2,
207,
0
],
[
0,
0,
0,
0,
12,
0,
165
]
],
"cat_f1_BoardGov": 0.9698275862068966,
"cat_prec_BoardGov": 0.9615384615384616,
"cat_recall_BoardGov": 0.9782608695652174,
"cat_f1_Incident": 0.96045197740113,
"cat_prec_Incident": 0.9550561797752809,
"cat_recall_Incident": 0.9659090909090909,
"cat_f1_Manageme": 0.9446254071661238,
"cat_prec_Manageme": 0.9235668789808917,
"cat_recall_Manageme": 0.9666666666666667,
"cat_f1_NoneOthe": 0.8843537414965986,
"cat_prec_NoneOthe": 0.8227848101265823,
"cat_recall_NoneOthe": 0.9558823529411765,
"cat_f1_RiskMana": 0.8586387434554974,
"cat_prec_RiskMana": 0.8913043478260869,
"cat_recall_RiskMana": 0.8282828282828283,
"cat_f1_Strategy": 0.9583333333333334,
"cat_prec_Strategy": 0.981042654028436,
"cat_recall_Strategy": 0.9366515837104072,
"cat_f1_Third-Pa": 0.9593023255813954,
"cat_prec_Third-Pa": 0.9880239520958084,
"cat_recall_Third-Pa": 0.9322033898305084,
"cat_kripp_alpha": 0.9223561935890119,
"spec_macro_f1": 0.8918479759675974,
"spec_weighted_f1": 0.9097693388297432,
"spec_macro_precision": 0.8930494570032042,
"spec_macro_recall": 0.8915621000757135,
"spec_mcc": 0.8628946887605918,
"spec_auc": 0.9807842405238503,
"spec_ece": 0.07049367701013878,
"spec_confusion_matrix": [
[
577,
24,
14,
3
],
[
29,
129,
8,
2
],
[
9,
5,
191,
2
],
[
2,
1,
9,
195
]
],
"spec_f1_L1Generi": 0.934412955465587,
"spec_prec_L1Generi": 0.9351701782820098,
"spec_recall_L1Generi": 0.9336569579288025,
"spec_f1_L2Domain": 0.7889908256880734,
"spec_prec_L2Domain": 0.8113207547169812,
"spec_recall_L2Domain": 0.7678571428571429,
"spec_f1_L3Firm-S": 0.8904428904428905,
"spec_prec_L3Firm-S": 0.8603603603603603,
"spec_recall_L3Firm-S": 0.9227053140096618,
"spec_f1_L4Quanti": 0.9535452322738386,
"spec_prec_L4Quanti": 0.9653465346534653,
"spec_recall_L4Quanti": 0.9420289855072463,
"spec_qwk": 0.931514217618119,
"spec_mae": 0.12,
"spec_kripp_alpha": 0.9169918680049234
},
"Opus-4.6": {
"cat_macro_f1": 0.9242573204255528,
"cat_weighted_f1": 0.9232556488517519,
"cat_macro_precision": 0.9193897229484191,
"cat_macro_recall": 0.9331778058838005,
"cat_mcc": 0.9112549308356716,
"cat_auc": 0.9941614030336741,
"cat_ece": 0.06330573419729862,
"cat_confusion_matrix": [
[
212,
0,
1,
0,
1,
0,
0
],
[
0,
78,
0,
0,
1,
0,
0
],
[
8,
0,
145,
1,
3,
0,
1
],
[
0,
0,
1,
138,
1,
1,
0
],
[
13,
0,
9,
14,
169,
1,
7
],
[
1,
11,
1,
4,
3,
208,
0
],
[
0,
0,
0,
1,
6,
1,
159
]
],
"cat_f1_BoardGov": 0.9464285714285714,
"cat_prec_BoardGov": 0.905982905982906,
"cat_recall_BoardGov": 0.9906542056074766,
"cat_f1_Incident": 0.9285714285714286,
"cat_prec_Incident": 0.8764044943820225,
"cat_recall_Incident": 0.9873417721518988,
"cat_f1_Manageme": 0.9206349206349206,
"cat_prec_Manageme": 0.9235668789808917,
"cat_recall_Manageme": 0.9177215189873418,
"cat_f1_NoneOthe": 0.9230769230769231,
"cat_prec_NoneOthe": 0.8734177215189873,
"cat_recall_NoneOthe": 0.9787234042553191,
"cat_f1_RiskMana": 0.8513853904282116,
"cat_prec_RiskMana": 0.9184782608695652,
"cat_recall_RiskMana": 0.7934272300469484,
"cat_f1_Strategy": 0.9476082004555809,
"cat_prec_Strategy": 0.985781990521327,
"cat_recall_Strategy": 0.9122807017543859,
"cat_f1_Third-Pa": 0.9520958083832335,
"cat_prec_Third-Pa": 0.9520958083832335,
"cat_recall_Third-Pa": 0.9520958083832335,
"cat_kripp_alpha": 0.9105393643352402,
"spec_macro_f1": 0.8827245859621925,
"spec_weighted_f1": 0.8997656600606208,
"spec_macro_precision": 0.8833309642003535,
"spec_macro_recall": 0.8861518760895928,
"spec_mcc": 0.8488976906438819,
"spec_auc": 0.9740582923879771,
"spec_ece": 0.08290670409798626,
"spec_confusion_matrix": [
[
564,
31,
9,
1
],
[
22,
118,
3,
2
],
[
31,
10,
209,
10
],
[
0,
0,
1,
189
]
],
"spec_f1_L1Generi": 0.9230769230769231,
"spec_prec_L1Generi": 0.9141004862236629,
"spec_recall_L1Generi": 0.9322314049586777,
"spec_f1_L2Domain": 0.7763157894736842,
"spec_prec_L2Domain": 0.7421383647798742,
"spec_recall_L2Domain": 0.8137931034482758,
"spec_f1_L3Firm-S": 0.8672199170124482,
"spec_prec_L3Firm-S": 0.9414414414414415,
"spec_recall_L3Firm-S": 0.8038461538461539,
"spec_f1_L4Quanti": 0.9642857142857143,
"spec_prec_L4Quanti": 0.9356435643564357,
"spec_recall_L4Quanti": 0.9947368421052631,
"spec_qwk": 0.92235918049198,
"spec_mae": 0.13666666666666666,
"spec_kripp_alpha": 0.9061330450504643
},
"_runtime": {
"encoder_mb": 789.563648,
"ms_per_sample": 9.671733896636093,
"throughput_per_s": 103.39407707937539,
"peak_vram_mb": 1774.27392578125,
"build_s": 0.4831273259478621
}
}