Joey Eamigh 67beaede45
quantization + onnx sweeps
Phase 10.8: torchao/bnb quantization sweep on iter1-independent. bf16 is
already optimal; torchao int8 weight-only (int8-wo) gives -19% VRAM at no
F1 cost; all 4-bit variants collapse (ModernBERT-large is too
quantization-sensitive).

Phase 10.9: ONNX export + ORT eval. The legacy exporter is the only
working path (the dynamo exporter adds 56 Memcpy nodes); ORT fp32 gives
-22% latency vs torch via kernel fusion, but bf16+flash-attn-2 still
wins; fp16 is broken on rotary; dynamic int8 silently falls back to CPU
and shows a 0.5 F1 collapse.

Driver scripts wired to bun run py:quant / py:onnx; full reports at
results/eval/{quant,onnx}/REPORT.md.
2026-04-07 05:10:38 -04:00

297 lines
6.3 KiB
JSON

{
"GPT-5.4": {
"cat_macro_f1": 0.16293893512410998,
"cat_weighted_f1": 0.1746727986514593,
"cat_macro_precision": 0.6289222195093943,
"cat_macro_recall": 0.23220413662370398,
"cat_mcc": 0.22345796853389935,
"cat_auc": 0.8960306312891495,
"cat_ece": 0.2080524676044782,
"cat_confusion_matrix": [
[
8,
0,
0,
0,
221,
0,
1
],
[
0,
0,
0,
1,
82,
0,
5
],
[
0,
0,
4,
0,
145,
0,
1
],
[
0,
0,
2,
3,
128,
0,
3
],
[
0,
0,
0,
0,
195,
0,
3
],
[
0,
0,
0,
0,
208,
2,
11
],
[
0,
0,
0,
0,
80,
0,
97
]
],
"cat_f1_BoardGov": 0.06722689075630252,
"cat_prec_BoardGov": 1.0,
"cat_recall_BoardGov": 0.034782608695652174,
"cat_f1_Incident": 0.0,
"cat_prec_Incident": 0.0,
"cat_recall_Incident": 0.0,
"cat_f1_Manageme": 0.05128205128205128,
"cat_prec_Manageme": 0.6666666666666666,
"cat_recall_Manageme": 0.02666666666666667,
"cat_f1_NoneOthe": 0.04285714285714286,
"cat_prec_NoneOthe": 0.75,
"cat_recall_NoneOthe": 0.022058823529411766,
"cat_f1_RiskMana": 0.31026252983293556,
"cat_prec_RiskMana": 0.18413597733711048,
"cat_recall_RiskMana": 0.9848484848484849,
"cat_f1_Strategy": 0.017937219730941704,
"cat_prec_Strategy": 1.0,
"cat_recall_Strategy": 0.00904977375565611,
"cat_f1_Third-Pa": 0.6510067114093959,
"cat_prec_Third-Pa": 0.8016528925619835,
"cat_recall_Third-Pa": 0.5480225988700564,
"cat_kripp_alpha": -0.08693512028952255,
"spec_macro_f1": 0.20854117827130608,
"spec_weighted_f1": 0.2571301750438355,
"spec_macro_precision": 0.3741612607031285,
"spec_macro_recall": 0.33018440069147115,
"spec_mcc": 0.1895317453505129,
"spec_auc": 0.8110497500610155,
"spec_ece": 0.44289420386155437,
"spec_confusion_matrix": [
[
136,
473,
9,
0
],
[
4,
163,
1,
0
],
[
1,
179,
27,
0
],
[
2,
171,
34,
0
]
],
"spec_f1_L1Generi": 0.35742444152431013,
"spec_prec_L1Generi": 0.951048951048951,
"spec_recall_L1Generi": 0.22006472491909385,
"spec_f1_L2Domain": 0.2824956672443674,
"spec_prec_L2Domain": 0.16531440162271804,
"spec_recall_L2Domain": 0.9702380952380952,
"spec_f1_L3Firm-S": 0.19424460431654678,
"spec_prec_L3Firm-S": 0.38028169014084506,
"spec_recall_L3Firm-S": 0.13043478260869565,
"spec_f1_L4Quanti": 0.0,
"spec_prec_L4Quanti": 0.0,
"spec_recall_L4Quanti": 0.0,
"spec_qwk": 0.2326064604575444,
"spec_mae": 0.8825,
"spec_kripp_alpha": 0.26499611744119067
},
"Opus-4.6": {
"cat_macro_f1": 0.16861118726256397,
"cat_weighted_f1": 0.1792365613004711,
"cat_macro_precision": 0.6306758954840335,
"cat_macro_recall": 0.2357303291121537,
"cat_mcc": 0.2251562222131823,
"cat_auc": 0.8995073249291591,
"cat_ece": 0.19888580093781152,
"cat_confusion_matrix": [
[
8,
0,
0,
0,
205,
0,
1
],
[
0,
0,
0,
1,
73,
0,
5
],
[
0,
0,
4,
0,
154,
0,
0
],
[
0,
0,
2,
3,
133,
0,
3
],
[
0,
0,
0,
0,
208,
0,
5
],
[
0,
0,
0,
0,
216,
2,
10
],
[
0,
0,
0,
0,
70,
0,
97
]
],
"cat_f1_BoardGov": 0.07207207207207207,
"cat_prec_BoardGov": 1.0,
"cat_recall_BoardGov": 0.037383177570093455,
"cat_f1_Incident": 0.0,
"cat_prec_Incident": 0.0,
"cat_recall_Incident": 0.0,
"cat_f1_Manageme": 0.04878048780487805,
"cat_prec_Manageme": 0.6666666666666666,
"cat_recall_Manageme": 0.02531645569620253,
"cat_f1_NoneOthe": 0.041379310344827586,
"cat_prec_NoneOthe": 0.75,
"cat_recall_NoneOthe": 0.02127659574468085,
"cat_f1_RiskMana": 0.3270440251572327,
"cat_prec_RiskMana": 0.1964117091595845,
"cat_recall_RiskMana": 0.9765258215962441,
"cat_f1_Strategy": 0.017391304347826087,
"cat_prec_Strategy": 1.0,
"cat_recall_Strategy": 0.008771929824561403,
"cat_f1_Third-Pa": 0.6736111111111112,
"cat_prec_Third-Pa": 0.8016528925619835,
"cat_recall_Third-Pa": 0.5808383233532934,
"cat_kripp_alpha": -0.07941064783948448,
"spec_macro_f1": 0.19783939283519508,
"spec_weighted_f1": 0.24886714543281097,
"spec_macro_precision": 0.37592821714182745,
"spec_macro_recall": 0.3291807330600434,
"spec_mcc": 0.18219176358380398,
"spec_auc": 0.790090253498083,
"spec_ece": 0.45814307530721027,
"spec_confusion_matrix": [
[
132,
466,
7,
0
],
[
1,
142,
2,
0
],
[
8,
221,
31,
0
],
[
2,
157,
31,
0
]
],
"spec_f1_L1Generi": 0.35294117647058826,
"spec_prec_L1Generi": 0.9230769230769231,
"spec_recall_L1Generi": 0.21818181818181817,
"spec_f1_L2Domain": 0.251105216622458,
"spec_prec_L2Domain": 0.1440162271805274,
"spec_recall_L2Domain": 0.9793103448275862,
"spec_f1_L3Firm-S": 0.18731117824773413,
"spec_prec_L3Firm-S": 0.43661971830985913,
"spec_recall_L3Firm-S": 0.11923076923076924,
"spec_f1_L4Quanti": 0.0,
"spec_prec_L4Quanti": 0.0,
"spec_recall_L4Quanti": 0.0,
"spec_qwk": 0.22580295138888895,
"spec_mae": 0.8925,
"spec_kripp_alpha": 0.2579634594689497
},
"_runtime": {
"encoder_mb": 274.843904,
"ms_per_sample": 5.865302347471394,
"throughput_per_s": 170.49419463109393,
"peak_vram_mb": 1287.34326171875,
"build_s": 0.4887635139748454
}
}