Phase 10.8: torchao/bnb quant sweep on iter1-independent. bf16 already
optimal; torchao int8-wo gives -19% VRAM at no F1 cost; all 4-bit
variants collapse (ModernBERT-large too quant-sensitive).
Phase 10.9: ONNX export + ORT eval. Legacy exporter only working path
(dynamo adds 56 Memcpy nodes); ORT fp32 -22% latency vs torch via
kernel fusion but bf16+flash-attn-2 still wins; fp16 broken on rotary;
dynamic int8 silently CPU-fallback + 0.5 F1 collapse.
Driver scripts wired to bun run py:quant / py:onnx; full reports at
results/eval/{quant,onnx}/REPORT.md.
297 lines
6.3 KiB
JSON
297 lines
6.3 KiB
JSON
{
|
|
"GPT-5.4": {
|
|
"cat_macro_f1": 0.3536909012886116,
|
|
"cat_weighted_f1": 0.4058815979606338,
|
|
"cat_macro_precision": 0.6317997184487815,
|
|
"cat_macro_recall": 0.38979766446605063,
|
|
"cat_mcc": 0.42471542150657926,
|
|
"cat_auc": 0.9205800077405307,
|
|
"cat_ece": 0.09734637491405013,
|
|
"cat_confusion_matrix": [
|
|
[
|
|
143,
|
|
0,
|
|
0,
|
|
0,
|
|
85,
|
|
0,
|
|
2
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
4,
|
|
73,
|
|
0,
|
|
11
|
|
],
|
|
[
|
|
3,
|
|
0,
|
|
20,
|
|
1,
|
|
124,
|
|
0,
|
|
2
|
|
],
|
|
[
|
|
1,
|
|
0,
|
|
1,
|
|
5,
|
|
122,
|
|
0,
|
|
7
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
185,
|
|
0,
|
|
13
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
180,
|
|
28,
|
|
13
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
22,
|
|
0,
|
|
155
|
|
]
|
|
],
|
|
"cat_f1_BoardGov": 0.7586206896551724,
|
|
"cat_prec_BoardGov": 0.9727891156462585,
|
|
"cat_recall_BoardGov": 0.6217391304347826,
|
|
"cat_f1_Incident": 0.0,
|
|
"cat_prec_Incident": 0.0,
|
|
"cat_recall_Incident": 0.0,
|
|
"cat_f1_Manageme": 0.23391812865497075,
|
|
"cat_prec_Manageme": 0.9523809523809523,
|
|
"cat_recall_Manageme": 0.13333333333333333,
|
|
"cat_f1_NoneOthe": 0.0684931506849315,
|
|
"cat_prec_NoneOthe": 0.5,
|
|
"cat_recall_NoneOthe": 0.03676470588235294,
|
|
"cat_f1_RiskMana": 0.3741152679474216,
|
|
"cat_prec_RiskMana": 0.23388116308470291,
|
|
"cat_recall_RiskMana": 0.9343434343434344,
|
|
"cat_f1_Strategy": 0.2248995983935743,
|
|
"cat_prec_Strategy": 1.0,
|
|
"cat_recall_Strategy": 0.12669683257918551,
|
|
"cat_f1_Third-Pa": 0.8157894736842105,
|
|
"cat_prec_Third-Pa": 0.7635467980295566,
|
|
"cat_recall_Third-Pa": 0.8757062146892656,
|
|
"cat_kripp_alpha": 0.27180867501339423,
|
|
"spec_macro_f1": 0.22049451330952025,
|
|
"spec_weighted_f1": 0.26278390857815354,
|
|
"spec_macro_precision": 0.4075440073341987,
|
|
"spec_macro_recall": 0.34148466970860386,
|
|
"spec_mcc": 0.20939315966102864,
|
|
"spec_auc": 0.8490039116946011,
|
|
"spec_ece": 0.43363295723994577,
|
|
"spec_confusion_matrix": [
|
|
[
|
|
132,
|
|
483,
|
|
3,
|
|
0
|
|
],
|
|
[
|
|
2,
|
|
166,
|
|
0,
|
|
0
|
|
],
|
|
[
|
|
2,
|
|
171,
|
|
34,
|
|
0
|
|
],
|
|
[
|
|
0,
|
|
175,
|
|
32,
|
|
0
|
|
]
|
|
],
|
|
"spec_f1_L1Generi": 0.35013262599469497,
|
|
"spec_prec_L1Generi": 0.9705882352941176,
|
|
"spec_recall_L1Generi": 0.21359223300970873,
|
|
"spec_f1_L2Domain": 0.28546861564918313,
|
|
"spec_prec_L2Domain": 0.16683417085427135,
|
|
"spec_recall_L2Domain": 0.9880952380952381,
|
|
"spec_f1_L3Firm-S": 0.2463768115942029,
|
|
"spec_prec_L3Firm-S": 0.4927536231884058,
|
|
"spec_recall_L3Firm-S": 0.1642512077294686,
|
|
"spec_f1_L4Quanti": 0.0,
|
|
"spec_prec_L4Quanti": 0.0,
|
|
"spec_recall_L4Quanti": 0.0,
|
|
"spec_qwk": 0.24233251808742773,
|
|
"spec_mae": 0.8733333333333333,
|
|
"spec_kripp_alpha": 0.2761091078775676
|
|
},
|
|
"Opus-4.6": {
|
|
"cat_macro_f1": 0.35763512449392704,
|
|
"cat_weighted_f1": 0.40173099854659305,
|
|
"cat_macro_precision": 0.6354693148020794,
|
|
"cat_macro_recall": 0.39500680662311666,
|
|
"cat_mcc": 0.42166882753874363,
|
|
"cat_auc": 0.9209441610065957,
|
|
"cat_ece": 0.09567970824738346,
|
|
"cat_confusion_matrix": [
|
|
[
|
|
141,
|
|
0,
|
|
0,
|
|
0,
|
|
71,
|
|
0,
|
|
2
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
4,
|
|
65,
|
|
0,
|
|
10
|
|
],
|
|
[
|
|
5,
|
|
0,
|
|
21,
|
|
1,
|
|
131,
|
|
0,
|
|
0
|
|
],
|
|
[
|
|
1,
|
|
0,
|
|
0,
|
|
5,
|
|
128,
|
|
0,
|
|
7
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
194,
|
|
0,
|
|
19
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
186,
|
|
28,
|
|
14
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
16,
|
|
0,
|
|
151
|
|
]
|
|
],
|
|
"cat_f1_BoardGov": 0.7811634349030471,
|
|
"cat_prec_BoardGov": 0.9591836734693877,
|
|
"cat_recall_BoardGov": 0.6588785046728972,
|
|
"cat_f1_Incident": 0.0,
|
|
"cat_prec_Incident": 0.0,
|
|
"cat_recall_Incident": 0.0,
|
|
"cat_f1_Manageme": 0.2346368715083799,
|
|
"cat_prec_Manageme": 1.0,
|
|
"cat_recall_Manageme": 0.13291139240506328,
|
|
"cat_f1_NoneOthe": 0.06622516556291391,
|
|
"cat_prec_NoneOthe": 0.5,
|
|
"cat_recall_NoneOthe": 0.03546099290780142,
|
|
"cat_f1_RiskMana": 0.38645418326693226,
|
|
"cat_prec_RiskMana": 0.24525916561314792,
|
|
"cat_recall_RiskMana": 0.9107981220657277,
|
|
"cat_f1_Strategy": 0.21875,
|
|
"cat_prec_Strategy": 1.0,
|
|
"cat_recall_Strategy": 0.12280701754385964,
|
|
"cat_f1_Third-Pa": 0.8162162162162162,
|
|
"cat_prec_Third-Pa": 0.7438423645320197,
|
|
"cat_recall_Third-Pa": 0.9041916167664671,
|
|
"cat_kripp_alpha": 0.27338793761748126,
|
|
"spec_macro_f1": 0.20754679251319788,
|
|
"spec_weighted_f1": 0.25637242485646744,
|
|
"spec_macro_precision": 0.40946072005380696,
|
|
"spec_macro_recall": 0.33929593134138586,
|
|
"spec_mcc": 0.2041103760829744,
|
|
"spec_auc": 0.8271022317290393,
|
|
"spec_ece": 0.4489923599362374,
|
|
"spec_confusion_matrix": [
|
|
[
|
|
130,
|
|
473,
|
|
2,
|
|
0
|
|
],
|
|
[
|
|
0,
|
|
145,
|
|
0,
|
|
0
|
|
],
|
|
[
|
|
6,
|
|
217,
|
|
37,
|
|
0
|
|
],
|
|
[
|
|
0,
|
|
160,
|
|
30,
|
|
0
|
|
]
|
|
],
|
|
"spec_f1_L1Generi": 0.3508771929824561,
|
|
"spec_prec_L1Generi": 0.9558823529411765,
|
|
"spec_recall_L1Generi": 0.21487603305785125,
|
|
"spec_f1_L2Domain": 0.2543859649122807,
|
|
"spec_prec_L2Domain": 0.1457286432160804,
|
|
"spec_recall_L2Domain": 1.0,
|
|
"spec_f1_L3Firm-S": 0.22492401215805471,
|
|
"spec_prec_L3Firm-S": 0.5362318840579711,
|
|
"spec_recall_L3Firm-S": 0.1423076923076923,
|
|
"spec_f1_L4Quanti": 0.0,
|
|
"spec_prec_L4Quanti": 0.0,
|
|
"spec_recall_L4Quanti": 0.0,
|
|
"spec_qwk": 0.24096533359991634,
|
|
"spec_mae": 0.88,
|
|
"spec_kripp_alpha": 0.2758412395136435
|
|
},
|
|
"_runtime": {
|
|
"encoder_mb": 274.843904,
|
|
"ms_per_sample": 5.861402786540566,
|
|
"throughput_per_s": 170.607623536175,
|
|
"peak_vram_mb": 1287.34326171875,
|
|
"build_s": 0.4908116469741799
|
|
}
|
|
} |