Phase 10.8: torchao/bitsandbytes (bnb) quantization sweep on
iter1-independent. bf16 is already optimal; torchao int8 weight-only
(int8-wo) cuts VRAM by 19% at no F1 cost; all 4-bit variants collapse
(ModernBERT-large is too quantization-sensitive).
Phase 10.9: ONNX export + ONNX Runtime (ORT) eval. The legacy exporter is
the only working path (the dynamo exporter adds 56 Memcpy nodes); ORT fp32
has 22% lower latency than torch thanks to kernel fusion, but
bf16 + flash-attn-2 still wins; fp16 is broken on rotary; dynamic int8
silently falls back to CPU, with a 0.5 F1 collapse.
Driver scripts wired to bun run py:quant / py:onnx; full reports at
results/eval/{quant,onnx}/REPORT.md.
297 lines
6.5 KiB
JSON
297 lines
6.5 KiB
JSON
{
|
|
"GPT-5.4": {
|
|
"cat_macro_f1": 0.9344870894825886,
|
|
"cat_weighted_f1": 0.9351173265780133,
|
|
"cat_macro_precision": 0.9326512314038842,
|
|
"cat_macro_recall": 0.9387442461546238,
|
|
"cat_mcc": 0.9237006314618685,
|
|
"cat_auc": 0.992309699625497,
|
|
"cat_ece": 0.05415941931307314,
|
|
"cat_confusion_matrix": [
|
|
[
|
|
225,
|
|
0,
|
|
3,
|
|
0,
|
|
2,
|
|
0,
|
|
0
|
|
],
|
|
[
|
|
0,
|
|
85,
|
|
0,
|
|
0,
|
|
2,
|
|
1,
|
|
0
|
|
],
|
|
[
|
|
2,
|
|
0,
|
|
145,
|
|
1,
|
|
2,
|
|
0,
|
|
0
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
3,
|
|
131,
|
|
0,
|
|
2,
|
|
0
|
|
],
|
|
[
|
|
6,
|
|
1,
|
|
5,
|
|
19,
|
|
164,
|
|
1,
|
|
2
|
|
],
|
|
[
|
|
0,
|
|
3,
|
|
1,
|
|
8,
|
|
2,
|
|
207,
|
|
0
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
12,
|
|
0,
|
|
165
|
|
]
|
|
],
|
|
"cat_f1_BoardGov": 0.9719222462203023,
|
|
"cat_prec_BoardGov": 0.9656652360515021,
|
|
"cat_recall_BoardGov": 0.9782608695652174,
|
|
"cat_f1_Incident": 0.96045197740113,
|
|
"cat_prec_Incident": 0.9550561797752809,
|
|
"cat_recall_Incident": 0.9659090909090909,
|
|
"cat_f1_Manageme": 0.9446254071661238,
|
|
"cat_prec_Manageme": 0.9235668789808917,
|
|
"cat_recall_Manageme": 0.9666666666666667,
|
|
"cat_f1_NoneOthe": 0.888135593220339,
|
|
"cat_prec_NoneOthe": 0.8238993710691824,
|
|
"cat_recall_NoneOthe": 0.9632352941176471,
|
|
"cat_f1_RiskMana": 0.8586387434554974,
|
|
"cat_prec_RiskMana": 0.8913043478260869,
|
|
"cat_recall_RiskMana": 0.8282828282828283,
|
|
"cat_f1_Strategy": 0.9583333333333334,
|
|
"cat_prec_Strategy": 0.981042654028436,
|
|
"cat_recall_Strategy": 0.9366515837104072,
|
|
"cat_f1_Third-Pa": 0.9593023255813954,
|
|
"cat_prec_Third-Pa": 0.9880239520958084,
|
|
"cat_recall_Third-Pa": 0.9322033898305084,
|
|
"cat_kripp_alpha": 0.9233443339647499,
|
|
"spec_macro_f1": 0.8941203230194683,
|
|
"spec_weighted_f1": 0.9115075208518084,
|
|
"spec_macro_precision": 0.8957148694260108,
|
|
"spec_macro_recall": 0.892931893103379,
|
|
"spec_mcc": 0.8651929532300995,
|
|
"spec_auc": 0.981624069084201,
|
|
"spec_ece": 0.06980206420024232,
|
|
"spec_confusion_matrix": [
|
|
[
|
|
579,
|
|
24,
|
|
12,
|
|
3
|
|
],
|
|
[
|
|
29,
|
|
131,
|
|
6,
|
|
2
|
|
],
|
|
[
|
|
10,
|
|
6,
|
|
189,
|
|
2
|
|
],
|
|
[
|
|
2,
|
|
1,
|
|
9,
|
|
195
|
|
]
|
|
],
|
|
"spec_f1_L1Generi": 0.9353796445880452,
|
|
"spec_prec_L1Generi": 0.9338709677419355,
|
|
"spec_recall_L1Generi": 0.9368932038834952,
|
|
"spec_f1_L2Domain": 0.793939393939394,
|
|
"spec_prec_L2Domain": 0.808641975308642,
|
|
"spec_recall_L2Domain": 0.7797619047619048,
|
|
"spec_f1_L3Firm-S": 0.8936170212765957,
|
|
"spec_prec_L3Firm-S": 0.875,
|
|
"spec_recall_L3Firm-S": 0.9130434782608695,
|
|
"spec_f1_L4Quanti": 0.9535452322738386,
|
|
"spec_prec_L4Quanti": 0.9653465346534653,
|
|
"spec_recall_L4Quanti": 0.9420289855072463,
|
|
"spec_qwk": 0.9329693660903852,
|
|
"spec_mae": 0.1175,
|
|
"spec_kripp_alpha": 0.9181842655510584
|
|
},
|
|
"Opus-4.6": {
|
|
"cat_macro_f1": 0.9234810481200378,
|
|
"cat_weighted_f1": 0.9224737817442137,
|
|
"cat_macro_precision": 0.9185473372257941,
|
|
"cat_macro_recall": 0.9325102491414775,
|
|
"cat_mcc": 0.9102750101817324,
|
|
"cat_auc": 0.9940184741579791,
|
|
"cat_ece": 0.0641141641388337,
|
|
"cat_confusion_matrix": [
|
|
[
|
|
211,
|
|
0,
|
|
1,
|
|
1,
|
|
1,
|
|
0,
|
|
0
|
|
],
|
|
[
|
|
0,
|
|
78,
|
|
0,
|
|
0,
|
|
1,
|
|
0,
|
|
0
|
|
],
|
|
[
|
|
8,
|
|
0,
|
|
145,
|
|
1,
|
|
3,
|
|
0,
|
|
1
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
1,
|
|
138,
|
|
1,
|
|
1,
|
|
0
|
|
],
|
|
[
|
|
13,
|
|
0,
|
|
9,
|
|
14,
|
|
169,
|
|
1,
|
|
7
|
|
],
|
|
[
|
|
1,
|
|
11,
|
|
1,
|
|
4,
|
|
3,
|
|
208,
|
|
0
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
0,
|
|
1,
|
|
6,
|
|
1,
|
|
159
|
|
]
|
|
],
|
|
"cat_f1_BoardGov": 0.9440715883668904,
|
|
"cat_prec_BoardGov": 0.9055793991416309,
|
|
"cat_recall_BoardGov": 0.985981308411215,
|
|
"cat_f1_Incident": 0.9285714285714286,
|
|
"cat_prec_Incident": 0.8764044943820225,
|
|
"cat_recall_Incident": 0.9873417721518988,
|
|
"cat_f1_Manageme": 0.9206349206349206,
|
|
"cat_prec_Manageme": 0.9235668789808917,
|
|
"cat_recall_Manageme": 0.9177215189873418,
|
|
"cat_f1_NoneOthe": 0.92,
|
|
"cat_prec_NoneOthe": 0.8679245283018868,
|
|
"cat_recall_NoneOthe": 0.9787234042553191,
|
|
"cat_f1_RiskMana": 0.8513853904282116,
|
|
"cat_prec_RiskMana": 0.9184782608695652,
|
|
"cat_recall_RiskMana": 0.7934272300469484,
|
|
"cat_f1_Strategy": 0.9476082004555809,
|
|
"cat_prec_Strategy": 0.985781990521327,
|
|
"cat_recall_Strategy": 0.9122807017543859,
|
|
"cat_f1_Third-Pa": 0.9520958083832335,
|
|
"cat_prec_Third-Pa": 0.9520958083832335,
|
|
"cat_recall_Third-Pa": 0.9520958083832335,
|
|
"cat_kripp_alpha": 0.9095617653952504,
|
|
"spec_macro_f1": 0.8814731397444973,
|
|
"spec_weighted_f1": 0.8981338362706646,
|
|
"spec_macro_precision": 0.8833981471623865,
|
|
"spec_macro_recall": 0.8849913986360116,
|
|
"spec_mcc": 0.8465512998506631,
|
|
"spec_auc": 0.9729999946345258,
|
|
"spec_ece": 0.08370273689428968,
|
|
"spec_confusion_matrix": [
|
|
[
|
|
564,
|
|
33,
|
|
7,
|
|
1
|
|
],
|
|
[
|
|
22,
|
|
119,
|
|
2,
|
|
2
|
|
],
|
|
[
|
|
34,
|
|
10,
|
|
206,
|
|
10
|
|
],
|
|
[
|
|
0,
|
|
0,
|
|
1,
|
|
189
|
|
]
|
|
],
|
|
"spec_f1_L1Generi": 0.9208163265306123,
|
|
"spec_prec_L1Generi": 0.9096774193548387,
|
|
"spec_recall_L1Generi": 0.9322314049586777,
|
|
"spec_f1_L2Domain": 0.7752442996742671,
|
|
"spec_prec_L2Domain": 0.7345679012345679,
|
|
"spec_recall_L2Domain": 0.8206896551724138,
|
|
"spec_f1_L3Firm-S": 0.865546218487395,
|
|
"spec_prec_L3Firm-S": 0.9537037037037037,
|
|
"spec_recall_L3Firm-S": 0.7923076923076923,
|
|
"spec_f1_L4Quanti": 0.9642857142857143,
|
|
"spec_prec_L4Quanti": 0.9356435643564357,
|
|
"spec_recall_L4Quanti": 0.9947368421052631,
|
|
"spec_qwk": 0.9207708779443254,
|
|
"spec_mae": 0.13916666666666666,
|
|
"spec_kripp_alpha": 0.9033268512180281
|
|
},
|
|
"_runtime": {
|
|
"encoder_mb": 789.563648,
|
|
"ms_per_sample": 6.078403938445263,
|
|
"throughput_per_s": 164.5168715549004,
|
|
"peak_vram_mb": 1416.36376953125,
|
|
"build_s": 0.5027359619853087
|
|
}
|
|
} |