Joey Eamigh 67beaede45
quantization + onnx sweeps
Phase 10.8: torchao/bnb quant sweep on iter1-independent. bf16 already
optimal; torchao int8-wo gives -19% VRAM at no F1 cost; all 4-bit
variants collapse (ModernBERT-large too quant-sensitive).

Phase 10.9: ONNX export + ORT eval. Legacy exporter only working path
(dynamo adds 56 Memcpy nodes); ORT fp32 -22% latency vs torch via
kernel fusion but bf16+flash-attn-2 still wins; fp16 broken on rotary;
dynamic int8 silently CPU-fallback + 0.5 F1 collapse.

Driver scripts wired to bun run py:quant / py:onnx; full reports at
results/eval/{quant,onnx}/REPORT.md.
2026-04-07 05:10:38 -04:00

297 lines
6.4 KiB
JSON

{
"GPT-5.4": {
"cat_macro_f1": 0.9360988760303737,
"cat_weighted_f1": 0.9367630863906107,
"cat_macro_precision": 0.934342558672944,
"cat_macro_recall": 0.9404157843351134,
"cat_mcc": 0.9256911778959798,
"cat_auc": 0.9918112947607864,
"cat_ece": 0.052939765204985965,
"cat_confusion_matrix": [
[
226,
0,
2,
0,
2,
0,
0
],
[
0,
85,
0,
0,
2,
1,
0
],
[
2,
0,
145,
1,
2,
0,
0
],
[
0,
0,
3,
132,
0,
1,
0
],
[
6,
1,
5,
19,
164,
1,
2
],
[
0,
3,
1,
8,
2,
207,
0
],
[
0,
0,
0,
0,
12,
0,
165
]
],
"cat_f1_BoardGov": 0.9741379310344828,
"cat_prec_BoardGov": 0.9658119658119658,
"cat_recall_BoardGov": 0.9826086956521739,
"cat_f1_Incident": 0.96045197740113,
"cat_prec_Incident": 0.9550561797752809,
"cat_recall_Incident": 0.9659090909090909,
"cat_f1_Manageme": 0.9477124183006536,
"cat_prec_Manageme": 0.9294871794871795,
"cat_recall_Manageme": 0.9666666666666667,
"cat_f1_NoneOthe": 0.8918918918918919,
"cat_prec_NoneOthe": 0.825,
"cat_recall_NoneOthe": 0.9705882352941176,
"cat_f1_RiskMana": 0.8586387434554974,
"cat_prec_RiskMana": 0.8913043478260869,
"cat_recall_RiskMana": 0.8282828282828283,
"cat_f1_Strategy": 0.9605568445475638,
"cat_prec_Strategy": 0.9857142857142858,
"cat_recall_Strategy": 0.9366515837104072,
"cat_f1_Third-Pa": 0.9593023255813954,
"cat_prec_Third-Pa": 0.9880239520958084,
"cat_recall_Third-Pa": 0.9322033898305084,
"cat_kripp_alpha": 0.9253092213149172,
"spec_macro_f1": 0.8986323186392307,
"spec_weighted_f1": 0.9144644120807768,
"spec_macro_precision": 0.9034925881673722,
"spec_macro_recall": 0.8950728490354916,
"spec_mcc": 0.870090391628814,
"spec_auc": 0.98134918835569,
"spec_ece": 0.06740866973996164,
"spec_confusion_matrix": [
[
582,
19,
14,
3
],
[
29,
130,
7,
2
],
[
12,
3,
190,
2
],
[
2,
1,
8,
196
]
],
"spec_f1_L1Generi": 0.9364440868865648,
"spec_prec_L1Generi": 0.9312,
"spec_recall_L1Generi": 0.941747572815534,
"spec_f1_L2Domain": 0.8099688473520249,
"spec_prec_L2Domain": 0.8496732026143791,
"spec_recall_L2Domain": 0.7738095238095238,
"spec_f1_L3Firm-S": 0.892018779342723,
"spec_prec_L3Firm-S": 0.867579908675799,
"spec_recall_L3Firm-S": 0.9178743961352657,
"spec_f1_L4Quanti": 0.9560975609756097,
"spec_prec_L4Quanti": 0.9655172413793104,
"spec_recall_L4Quanti": 0.9468599033816425,
"spec_qwk": 0.9307948020550015,
"spec_mae": 0.1175,
"spec_kripp_alpha": 0.9166492249745117
},
"Opus-4.6": {
"cat_macro_f1": 0.9235105849558979,
"cat_weighted_f1": 0.9224780370334836,
"cat_macro_precision": 0.9187130112710481,
"cat_macro_recall": 0.9326192612354074,
"cat_mcc": 0.9103198007176273,
"cat_auc": 0.9937246318315877,
"cat_ece": 0.06465620135267579,
"cat_confusion_matrix": [
[
211,
0,
1,
1,
1,
0,
0
],
[
0,
78,
0,
0,
1,
0,
0
],
[
9,
0,
144,
1,
3,
0,
1
],
[
0,
0,
1,
139,
1,
0,
0
],
[
13,
0,
9,
14,
169,
1,
7
],
[
1,
11,
1,
4,
3,
208,
0
],
[
0,
0,
0,
1,
6,
1,
159
]
],
"cat_f1_BoardGov": 0.9419642857142857,
"cat_prec_BoardGov": 0.9017094017094017,
"cat_recall_BoardGov": 0.985981308411215,
"cat_f1_Incident": 0.9285714285714286,
"cat_prec_Incident": 0.8764044943820225,
"cat_recall_Incident": 0.9873417721518988,
"cat_f1_Manageme": 0.9171974522292994,
"cat_prec_Manageme": 0.9230769230769231,
"cat_recall_Manageme": 0.9113924050632911,
"cat_f1_NoneOthe": 0.9235880398671097,
"cat_prec_NoneOthe": 0.86875,
"cat_recall_NoneOthe": 0.9858156028368794,
"cat_f1_RiskMana": 0.8513853904282116,
"cat_prec_RiskMana": 0.9184782608695652,
"cat_recall_RiskMana": 0.7934272300469484,
"cat_f1_Strategy": 0.9497716894977168,
"cat_prec_Strategy": 0.9904761904761905,
"cat_recall_Strategy": 0.9122807017543859,
"cat_f1_Third-Pa": 0.9520958083832335,
"cat_prec_Third-Pa": 0.9520958083832335,
"cat_recall_Third-Pa": 0.9520958083832335,
"cat_kripp_alpha": 0.9095619506866199,
"spec_macro_f1": 0.8826923642825633,
"spec_weighted_f1": 0.8991699562480843,
"spec_macro_precision": 0.8862949086294886,
"spec_macro_recall": 0.8831960153359262,
"spec_mcc": 0.8485449936701916,
"spec_auc": 0.9725823165743999,
"spec_ece": 0.083350846717755,
"spec_confusion_matrix": [
[
568,
27,
9,
1
],
[
23,
117,
3,
2
],
[
34,
9,
206,
11
],
[
0,
0,
1,
189
]
],
"spec_f1_L1Generi": 0.9235772357723577,
"spec_prec_L1Generi": 0.9088,
"spec_recall_L1Generi": 0.9388429752066115,
"spec_f1_L2Domain": 0.785234899328859,
"spec_prec_L2Domain": 0.7647058823529411,
"spec_recall_L2Domain": 0.8068965517241379,
"spec_f1_L3Firm-S": 0.860125260960334,
"spec_prec_L3Firm-S": 0.9406392694063926,
"spec_recall_L3Firm-S": 0.7923076923076923,
"spec_f1_L4Quanti": 0.9618320610687023,
"spec_prec_L4Quanti": 0.9310344827586207,
"spec_recall_L4Quanti": 0.9947368421052631,
"spec_qwk": 0.9198415117342273,
"spec_mae": 0.13916666666666666,
"spec_kripp_alpha": 0.9038906079654127
},
"_runtime": {
"encoder_mb": 789.563648,
"ms_per_sample": 7.762363941583317,
"throughput_per_s": 128.82673468103667,
"peak_vram_mb": 2135.203125,
"build_s": 1.1878160500200465
}
}