Phase 10.8: torchao/bnb quant sweep on iter1-independent. bf16 already
optimal; torchao int8-wo gives -19% VRAM at no F1 cost; all 4-bit
variants collapse (ModernBERT-large too quant-sensitive).
Phase 10.9: ONNX export + ORT eval. Legacy exporter only working path
(dynamo adds 56 Memcpy nodes); ORT fp32 -22% latency vs torch via
kernel fusion but bf16+flash-attn-2 still wins; fp16 broken on rotary;
dynamic int8 silently CPU-fallback + 0.5 F1 collapse.
Driver scripts wired to bun run py:quant / py:onnx; full reports at
results/eval/{quant,onnx}/REPORT.md.
297 lines
6.4 KiB
JSON
297 lines
6.4 KiB
JSON
{
|
|
"GPT-5.4": {
|
|
"cat_macro_f1": 0.9360988760303737,
|
|
"cat_weighted_f1": 0.9367630863906107,
|
|
"cat_macro_precision": 0.934342558672944,
|
|
"cat_macro_recall": 0.9404157843351134,
|
|
"cat_mcc": 0.9256911778959798,
|
|
"cat_auc": 0.9918112947607864,
|
|
"cat_ece": 0.052939765204985965,
|
|
"cat_confusion_matrix": [
|
|
[
|
|
226,
|
|
0,
|
|
2,
|
|
0,
|
|
2,
|
|
0,
|
|
0
|
|
],
|
|
[
|
|
0,
|
|
85,
|
|
0,
|
|
0,
|
|
2,
|
|
1,
|
|
0
|
|
],
|
|
[
|
|
2,
|
|
0,
|
|
145,
|
|
1,
|
|
2,
|
|
0,
|
|
0
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
3,
|
|
132,
|
|
0,
|
|
1,
|
|
0
|
|
],
|
|
[
|
|
6,
|
|
1,
|
|
5,
|
|
19,
|
|
164,
|
|
1,
|
|
2
|
|
],
|
|
[
|
|
0,
|
|
3,
|
|
1,
|
|
8,
|
|
2,
|
|
207,
|
|
0
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
12,
|
|
0,
|
|
165
|
|
]
|
|
],
|
|
"cat_f1_BoardGov": 0.9741379310344828,
|
|
"cat_prec_BoardGov": 0.9658119658119658,
|
|
"cat_recall_BoardGov": 0.9826086956521739,
|
|
"cat_f1_Incident": 0.96045197740113,
|
|
"cat_prec_Incident": 0.9550561797752809,
|
|
"cat_recall_Incident": 0.9659090909090909,
|
|
"cat_f1_Manageme": 0.9477124183006536,
|
|
"cat_prec_Manageme": 0.9294871794871795,
|
|
"cat_recall_Manageme": 0.9666666666666667,
|
|
"cat_f1_NoneOthe": 0.8918918918918919,
|
|
"cat_prec_NoneOthe": 0.825,
|
|
"cat_recall_NoneOthe": 0.9705882352941176,
|
|
"cat_f1_RiskMana": 0.8586387434554974,
|
|
"cat_prec_RiskMana": 0.8913043478260869,
|
|
"cat_recall_RiskMana": 0.8282828282828283,
|
|
"cat_f1_Strategy": 0.9605568445475638,
|
|
"cat_prec_Strategy": 0.9857142857142858,
|
|
"cat_recall_Strategy": 0.9366515837104072,
|
|
"cat_f1_Third-Pa": 0.9593023255813954,
|
|
"cat_prec_Third-Pa": 0.9880239520958084,
|
|
"cat_recall_Third-Pa": 0.9322033898305084,
|
|
"cat_kripp_alpha": 0.9253092213149172,
|
|
"spec_macro_f1": 0.8986323186392307,
|
|
"spec_weighted_f1": 0.9144644120807768,
|
|
"spec_macro_precision": 0.9034925881673722,
|
|
"spec_macro_recall": 0.8950728490354916,
|
|
"spec_mcc": 0.870090391628814,
|
|
"spec_auc": 0.98134918835569,
|
|
"spec_ece": 0.06740866973996164,
|
|
"spec_confusion_matrix": [
|
|
[
|
|
582,
|
|
19,
|
|
14,
|
|
3
|
|
],
|
|
[
|
|
29,
|
|
130,
|
|
7,
|
|
2
|
|
],
|
|
[
|
|
12,
|
|
3,
|
|
190,
|
|
2
|
|
],
|
|
[
|
|
2,
|
|
1,
|
|
8,
|
|
196
|
|
]
|
|
],
|
|
"spec_f1_L1Generi": 0.9364440868865648,
|
|
"spec_prec_L1Generi": 0.9312,
|
|
"spec_recall_L1Generi": 0.941747572815534,
|
|
"spec_f1_L2Domain": 0.8099688473520249,
|
|
"spec_prec_L2Domain": 0.8496732026143791,
|
|
"spec_recall_L2Domain": 0.7738095238095238,
|
|
"spec_f1_L3Firm-S": 0.892018779342723,
|
|
"spec_prec_L3Firm-S": 0.867579908675799,
|
|
"spec_recall_L3Firm-S": 0.9178743961352657,
|
|
"spec_f1_L4Quanti": 0.9560975609756097,
|
|
"spec_prec_L4Quanti": 0.9655172413793104,
|
|
"spec_recall_L4Quanti": 0.9468599033816425,
|
|
"spec_qwk": 0.9307948020550015,
|
|
"spec_mae": 0.1175,
|
|
"spec_kripp_alpha": 0.9166492249745117
|
|
},
|
|
"Opus-4.6": {
|
|
"cat_macro_f1": 0.9235105849558979,
|
|
"cat_weighted_f1": 0.9224780370334836,
|
|
"cat_macro_precision": 0.9187130112710481,
|
|
"cat_macro_recall": 0.9326192612354074,
|
|
"cat_mcc": 0.9103198007176273,
|
|
"cat_auc": 0.9937246318315877,
|
|
"cat_ece": 0.06465620135267579,
|
|
"cat_confusion_matrix": [
|
|
[
|
|
211,
|
|
0,
|
|
1,
|
|
1,
|
|
1,
|
|
0,
|
|
0
|
|
],
|
|
[
|
|
0,
|
|
78,
|
|
0,
|
|
0,
|
|
1,
|
|
0,
|
|
0
|
|
],
|
|
[
|
|
9,
|
|
0,
|
|
144,
|
|
1,
|
|
3,
|
|
0,
|
|
1
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
1,
|
|
139,
|
|
1,
|
|
0,
|
|
0
|
|
],
|
|
[
|
|
13,
|
|
0,
|
|
9,
|
|
14,
|
|
169,
|
|
1,
|
|
7
|
|
],
|
|
[
|
|
1,
|
|
11,
|
|
1,
|
|
4,
|
|
3,
|
|
208,
|
|
0
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
1,
|
|
6,
|
|
1,
|
|
159
|
|
]
|
|
],
|
|
"cat_f1_BoardGov": 0.9419642857142857,
|
|
"cat_prec_BoardGov": 0.9017094017094017,
|
|
"cat_recall_BoardGov": 0.985981308411215,
|
|
"cat_f1_Incident": 0.9285714285714286,
|
|
"cat_prec_Incident": 0.8764044943820225,
|
|
"cat_recall_Incident": 0.9873417721518988,
|
|
"cat_f1_Manageme": 0.9171974522292994,
|
|
"cat_prec_Manageme": 0.9230769230769231,
|
|
"cat_recall_Manageme": 0.9113924050632911,
|
|
"cat_f1_NoneOthe": 0.9235880398671097,
|
|
"cat_prec_NoneOthe": 0.86875,
|
|
"cat_recall_NoneOthe": 0.9858156028368794,
|
|
"cat_f1_RiskMana": 0.8513853904282116,
|
|
"cat_prec_RiskMana": 0.9184782608695652,
|
|
"cat_recall_RiskMana": 0.7934272300469484,
|
|
"cat_f1_Strategy": 0.9497716894977168,
|
|
"cat_prec_Strategy": 0.9904761904761905,
|
|
"cat_recall_Strategy": 0.9122807017543859,
|
|
"cat_f1_Third-Pa": 0.9520958083832335,
|
|
"cat_prec_Third-Pa": 0.9520958083832335,
|
|
"cat_recall_Third-Pa": 0.9520958083832335,
|
|
"cat_kripp_alpha": 0.9095619506866199,
|
|
"spec_macro_f1": 0.8826923642825633,
|
|
"spec_weighted_f1": 0.8991699562480843,
|
|
"spec_macro_precision": 0.8862949086294886,
|
|
"spec_macro_recall": 0.8831960153359262,
|
|
"spec_mcc": 0.8485449936701916,
|
|
"spec_auc": 0.9725823165743999,
|
|
"spec_ece": 0.083350846717755,
|
|
"spec_confusion_matrix": [
|
|
[
|
|
568,
|
|
27,
|
|
9,
|
|
1
|
|
],
|
|
[
|
|
23,
|
|
117,
|
|
3,
|
|
2
|
|
],
|
|
[
|
|
34,
|
|
9,
|
|
206,
|
|
11
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
1,
|
|
189
|
|
]
|
|
],
|
|
"spec_f1_L1Generi": 0.9235772357723577,
|
|
"spec_prec_L1Generi": 0.9088,
|
|
"spec_recall_L1Generi": 0.9388429752066115,
|
|
"spec_f1_L2Domain": 0.785234899328859,
|
|
"spec_prec_L2Domain": 0.7647058823529411,
|
|
"spec_recall_L2Domain": 0.8068965517241379,
|
|
"spec_f1_L3Firm-S": 0.860125260960334,
|
|
"spec_prec_L3Firm-S": 0.9406392694063926,
|
|
"spec_recall_L3Firm-S": 0.7923076923076923,
|
|
"spec_f1_L4Quanti": 0.9618320610687023,
|
|
"spec_prec_L4Quanti": 0.9310344827586207,
|
|
"spec_recall_L4Quanti": 0.9947368421052631,
|
|
"spec_qwk": 0.9198415117342273,
|
|
"spec_mae": 0.13916666666666666,
|
|
"spec_kripp_alpha": 0.9038906079654127
|
|
},
|
|
"_runtime": {
|
|
"encoder_mb": 789.563648,
|
|
"ms_per_sample": 7.762363941583317,
|
|
"throughput_per_s": 128.82673468103667,
|
|
"peak_vram_mb": 2135.203125,
|
|
"build_s": 1.1878160500200465
|
|
}
|
|
} |