Phase 10.8: torchao/bnb quant sweep on iter1-independent. bf16 already
optimal; torchao int8-wo gives -19% VRAM at no F1 cost; all 4-bit
variants collapse (ModernBERT-large too quant-sensitive).
Phase 10.9: ONNX export + ORT eval. Legacy exporter only working path
(dynamo adds 56 Memcpy nodes); ORT fp32 -22% latency vs torch via
kernel fusion but bf16+flash-attn-2 still wins; fp16 broken on rotary;
dynamic int8 silently CPU-fallback + 0.5 F1 collapse.
Driver scripts wired to bun run py:quant / py:onnx; full reports at
results/eval/{quant,onnx}/REPORT.md.
297 lines
6.3 KiB
JSON
297 lines
6.3 KiB
JSON
{
|
|
"GPT-5.4": {
|
|
"cat_macro_f1": 0.16293893512410998,
|
|
"cat_weighted_f1": 0.1746727986514593,
|
|
"cat_macro_precision": 0.6289222195093943,
|
|
"cat_macro_recall": 0.23220413662370398,
|
|
"cat_mcc": 0.22345796853389935,
|
|
"cat_auc": 0.8960306312891495,
|
|
"cat_ece": 0.2080524676044782,
|
|
"cat_confusion_matrix": [
|
|
[
|
|
8,
|
|
0,
|
|
0,
|
|
0,
|
|
221,
|
|
0,
|
|
1
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
1,
|
|
82,
|
|
0,
|
|
5
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
4,
|
|
0,
|
|
145,
|
|
0,
|
|
1
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
2,
|
|
3,
|
|
128,
|
|
0,
|
|
3
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
195,
|
|
0,
|
|
3
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
208,
|
|
2,
|
|
11
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
80,
|
|
0,
|
|
97
|
|
]
|
|
],
|
|
"cat_f1_BoardGov": 0.06722689075630252,
|
|
"cat_prec_BoardGov": 1.0,
|
|
"cat_recall_BoardGov": 0.034782608695652174,
|
|
"cat_f1_Incident": 0.0,
|
|
"cat_prec_Incident": 0.0,
|
|
"cat_recall_Incident": 0.0,
|
|
"cat_f1_Manageme": 0.05128205128205128,
|
|
"cat_prec_Manageme": 0.6666666666666666,
|
|
"cat_recall_Manageme": 0.02666666666666667,
|
|
"cat_f1_NoneOthe": 0.04285714285714286,
|
|
"cat_prec_NoneOthe": 0.75,
|
|
"cat_recall_NoneOthe": 0.022058823529411766,
|
|
"cat_f1_RiskMana": 0.31026252983293556,
|
|
"cat_prec_RiskMana": 0.18413597733711048,
|
|
"cat_recall_RiskMana": 0.9848484848484849,
|
|
"cat_f1_Strategy": 0.017937219730941704,
|
|
"cat_prec_Strategy": 1.0,
|
|
"cat_recall_Strategy": 0.00904977375565611,
|
|
"cat_f1_Third-Pa": 0.6510067114093959,
|
|
"cat_prec_Third-Pa": 0.8016528925619835,
|
|
"cat_recall_Third-Pa": 0.5480225988700564,
|
|
"cat_kripp_alpha": -0.08693512028952255,
|
|
"spec_macro_f1": 0.20854117827130608,
|
|
"spec_weighted_f1": 0.2571301750438355,
|
|
"spec_macro_precision": 0.3741612607031285,
|
|
"spec_macro_recall": 0.33018440069147115,
|
|
"spec_mcc": 0.1895317453505129,
|
|
"spec_auc": 0.8110497500610155,
|
|
"spec_ece": 0.44289420386155437,
|
|
"spec_confusion_matrix": [
|
|
[
|
|
136,
|
|
473,
|
|
9,
|
|
0
|
|
],
|
|
[
|
|
4,
|
|
163,
|
|
1,
|
|
0
|
|
],
|
|
[
|
|
1,
|
|
179,
|
|
27,
|
|
0
|
|
],
|
|
[
|
|
2,
|
|
171,
|
|
34,
|
|
0
|
|
]
|
|
],
|
|
"spec_f1_L1Generi": 0.35742444152431013,
|
|
"spec_prec_L1Generi": 0.951048951048951,
|
|
"spec_recall_L1Generi": 0.22006472491909385,
|
|
"spec_f1_L2Domain": 0.2824956672443674,
|
|
"spec_prec_L2Domain": 0.16531440162271804,
|
|
"spec_recall_L2Domain": 0.9702380952380952,
|
|
"spec_f1_L3Firm-S": 0.19424460431654678,
|
|
"spec_prec_L3Firm-S": 0.38028169014084506,
|
|
"spec_recall_L3Firm-S": 0.13043478260869565,
|
|
"spec_f1_L4Quanti": 0.0,
|
|
"spec_prec_L4Quanti": 0.0,
|
|
"spec_recall_L4Quanti": 0.0,
|
|
"spec_qwk": 0.2326064604575444,
|
|
"spec_mae": 0.8825,
|
|
"spec_kripp_alpha": 0.26499611744119067
|
|
},
|
|
"Opus-4.6": {
|
|
"cat_macro_f1": 0.16861118726256397,
|
|
"cat_weighted_f1": 0.1792365613004711,
|
|
"cat_macro_precision": 0.6306758954840335,
|
|
"cat_macro_recall": 0.2357303291121537,
|
|
"cat_mcc": 0.2251562222131823,
|
|
"cat_auc": 0.8995073249291591,
|
|
"cat_ece": 0.19888580093781152,
|
|
"cat_confusion_matrix": [
|
|
[
|
|
8,
|
|
0,
|
|
0,
|
|
0,
|
|
205,
|
|
0,
|
|
1
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
1,
|
|
73,
|
|
0,
|
|
5
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
4,
|
|
0,
|
|
154,
|
|
0,
|
|
0
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
2,
|
|
3,
|
|
133,
|
|
0,
|
|
3
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
208,
|
|
0,
|
|
5
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
216,
|
|
2,
|
|
10
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
70,
|
|
0,
|
|
97
|
|
]
|
|
],
|
|
"cat_f1_BoardGov": 0.07207207207207207,
|
|
"cat_prec_BoardGov": 1.0,
|
|
"cat_recall_BoardGov": 0.037383177570093455,
|
|
"cat_f1_Incident": 0.0,
|
|
"cat_prec_Incident": 0.0,
|
|
"cat_recall_Incident": 0.0,
|
|
"cat_f1_Manageme": 0.04878048780487805,
|
|
"cat_prec_Manageme": 0.6666666666666666,
|
|
"cat_recall_Manageme": 0.02531645569620253,
|
|
"cat_f1_NoneOthe": 0.041379310344827586,
|
|
"cat_prec_NoneOthe": 0.75,
|
|
"cat_recall_NoneOthe": 0.02127659574468085,
|
|
"cat_f1_RiskMana": 0.3270440251572327,
|
|
"cat_prec_RiskMana": 0.1964117091595845,
|
|
"cat_recall_RiskMana": 0.9765258215962441,
|
|
"cat_f1_Strategy": 0.017391304347826087,
|
|
"cat_prec_Strategy": 1.0,
|
|
"cat_recall_Strategy": 0.008771929824561403,
|
|
"cat_f1_Third-Pa": 0.6736111111111112,
|
|
"cat_prec_Third-Pa": 0.8016528925619835,
|
|
"cat_recall_Third-Pa": 0.5808383233532934,
|
|
"cat_kripp_alpha": -0.07941064783948448,
|
|
"spec_macro_f1": 0.19783939283519508,
|
|
"spec_weighted_f1": 0.24886714543281097,
|
|
"spec_macro_precision": 0.37592821714182745,
|
|
"spec_macro_recall": 0.3291807330600434,
|
|
"spec_mcc": 0.18219176358380398,
|
|
"spec_auc": 0.790090253498083,
|
|
"spec_ece": 0.45814307530721027,
|
|
"spec_confusion_matrix": [
|
|
[
|
|
132,
|
|
466,
|
|
7,
|
|
0
|
|
],
|
|
[
|
|
1,
|
|
142,
|
|
2,
|
|
0
|
|
],
|
|
[
|
|
8,
|
|
221,
|
|
31,
|
|
0
|
|
],
|
|
[
|
|
2,
|
|
157,
|
|
31,
|
|
0
|
|
]
|
|
],
|
|
"spec_f1_L1Generi": 0.35294117647058826,
|
|
"spec_prec_L1Generi": 0.9230769230769231,
|
|
"spec_recall_L1Generi": 0.21818181818181817,
|
|
"spec_f1_L2Domain": 0.251105216622458,
|
|
"spec_prec_L2Domain": 0.1440162271805274,
|
|
"spec_recall_L2Domain": 0.9793103448275862,
|
|
"spec_f1_L3Firm-S": 0.18731117824773413,
|
|
"spec_prec_L3Firm-S": 0.43661971830985913,
|
|
"spec_recall_L3Firm-S": 0.11923076923076924,
|
|
"spec_f1_L4Quanti": 0.0,
|
|
"spec_prec_L4Quanti": 0.0,
|
|
"spec_recall_L4Quanti": 0.0,
|
|
"spec_qwk": 0.22580295138888895,
|
|
"spec_mae": 0.8925,
|
|
"spec_kripp_alpha": 0.2579634594689497
|
|
},
|
|
"_runtime": {
|
|
"encoder_mb": 274.843904,
|
|
"ms_per_sample": 5.865302347471394,
|
|
"throughput_per_s": 170.49419463109393,
|
|
"peak_vram_mb": 1287.34326171875,
|
|
"build_s": 0.4887635139748454
|
|
}
|
|
} |