Joey Eamigh 67beaede45
quantization + onnx sweeps
Phase 10.8: torchao/bnb quant sweep on iter1-independent. bf16 already
optimal; torchao int8-wo gives -19% VRAM at no F1 cost; all 4-bit
variants collapse (ModernBERT-large too quant-sensitive).

Phase 10.9: ONNX export + ORT eval. Legacy exporter only working path
(dynamo adds 56 Memcpy nodes); ORT fp32 -22% latency vs torch via
kernel fusion but bf16+flash-attn-2 still wins; fp16 broken on rotary;
dynamic int8 silently CPU-fallback + 0.5 F1 collapse.

Driver scripts wired to bun run py:quant / py:onnx; full reports at
results/eval/{quant,onnx}/REPORT.md.
2026-04-07 05:10:38 -04:00

297 lines
6.3 KiB
JSON

{
"GPT-5.4": {
"cat_macro_f1": 0.3536909012886116,
"cat_weighted_f1": 0.4058815979606338,
"cat_macro_precision": 0.6317997184487815,
"cat_macro_recall": 0.38979766446605063,
"cat_mcc": 0.42471542150657926,
"cat_auc": 0.9205800077405307,
"cat_ece": 0.09734637491405013,
"cat_confusion_matrix": [
[
143,
0,
0,
0,
85,
0,
2
],
[
0,
0,
0,
4,
73,
0,
11
],
[
3,
0,
20,
1,
124,
0,
2
],
[
1,
0,
1,
5,
122,
0,
7
],
[
0,
0,
0,
0,
185,
0,
13
],
[
0,
0,
0,
0,
180,
28,
13
],
[
0,
0,
0,
0,
22,
0,
155
]
],
"cat_f1_BoardGov": 0.7586206896551724,
"cat_prec_BoardGov": 0.9727891156462585,
"cat_recall_BoardGov": 0.6217391304347826,
"cat_f1_Incident": 0.0,
"cat_prec_Incident": 0.0,
"cat_recall_Incident": 0.0,
"cat_f1_Manageme": 0.23391812865497075,
"cat_prec_Manageme": 0.9523809523809523,
"cat_recall_Manageme": 0.13333333333333333,
"cat_f1_NoneOthe": 0.0684931506849315,
"cat_prec_NoneOthe": 0.5,
"cat_recall_NoneOthe": 0.03676470588235294,
"cat_f1_RiskMana": 0.3741152679474216,
"cat_prec_RiskMana": 0.23388116308470291,
"cat_recall_RiskMana": 0.9343434343434344,
"cat_f1_Strategy": 0.2248995983935743,
"cat_prec_Strategy": 1.0,
"cat_recall_Strategy": 0.12669683257918551,
"cat_f1_Third-Pa": 0.8157894736842105,
"cat_prec_Third-Pa": 0.7635467980295566,
"cat_recall_Third-Pa": 0.8757062146892656,
"cat_kripp_alpha": 0.27180867501339423,
"spec_macro_f1": 0.22049451330952025,
"spec_weighted_f1": 0.26278390857815354,
"spec_macro_precision": 0.4075440073341987,
"spec_macro_recall": 0.34148466970860386,
"spec_mcc": 0.20939315966102864,
"spec_auc": 0.8490039116946011,
"spec_ece": 0.43363295723994577,
"spec_confusion_matrix": [
[
132,
483,
3,
0
],
[
2,
166,
0,
0
],
[
2,
171,
34,
0
],
[
0,
175,
32,
0
]
],
"spec_f1_L1Generi": 0.35013262599469497,
"spec_prec_L1Generi": 0.9705882352941176,
"spec_recall_L1Generi": 0.21359223300970873,
"spec_f1_L2Domain": 0.28546861564918313,
"spec_prec_L2Domain": 0.16683417085427135,
"spec_recall_L2Domain": 0.9880952380952381,
"spec_f1_L3Firm-S": 0.2463768115942029,
"spec_prec_L3Firm-S": 0.4927536231884058,
"spec_recall_L3Firm-S": 0.1642512077294686,
"spec_f1_L4Quanti": 0.0,
"spec_prec_L4Quanti": 0.0,
"spec_recall_L4Quanti": 0.0,
"spec_qwk": 0.24233251808742773,
"spec_mae": 0.8733333333333333,
"spec_kripp_alpha": 0.2761091078775676
},
"Opus-4.6": {
"cat_macro_f1": 0.35763512449392704,
"cat_weighted_f1": 0.40173099854659305,
"cat_macro_precision": 0.6354693148020794,
"cat_macro_recall": 0.39500680662311666,
"cat_mcc": 0.42166882753874363,
"cat_auc": 0.9209441610065957,
"cat_ece": 0.09567970824738346,
"cat_confusion_matrix": [
[
141,
0,
0,
0,
71,
0,
2
],
[
0,
0,
0,
4,
65,
0,
10
],
[
5,
0,
21,
1,
131,
0,
0
],
[
1,
0,
0,
5,
128,
0,
7
],
[
0,
0,
0,
0,
194,
0,
19
],
[
0,
0,
0,
0,
186,
28,
14
],
[
0,
0,
0,
0,
16,
0,
151
]
],
"cat_f1_BoardGov": 0.7811634349030471,
"cat_prec_BoardGov": 0.9591836734693877,
"cat_recall_BoardGov": 0.6588785046728972,
"cat_f1_Incident": 0.0,
"cat_prec_Incident": 0.0,
"cat_recall_Incident": 0.0,
"cat_f1_Manageme": 0.2346368715083799,
"cat_prec_Manageme": 1.0,
"cat_recall_Manageme": 0.13291139240506328,
"cat_f1_NoneOthe": 0.06622516556291391,
"cat_prec_NoneOthe": 0.5,
"cat_recall_NoneOthe": 0.03546099290780142,
"cat_f1_RiskMana": 0.38645418326693226,
"cat_prec_RiskMana": 0.24525916561314792,
"cat_recall_RiskMana": 0.9107981220657277,
"cat_f1_Strategy": 0.21875,
"cat_prec_Strategy": 1.0,
"cat_recall_Strategy": 0.12280701754385964,
"cat_f1_Third-Pa": 0.8162162162162162,
"cat_prec_Third-Pa": 0.7438423645320197,
"cat_recall_Third-Pa": 0.9041916167664671,
"cat_kripp_alpha": 0.27338793761748126,
"spec_macro_f1": 0.20754679251319788,
"spec_weighted_f1": 0.25637242485646744,
"spec_macro_precision": 0.40946072005380696,
"spec_macro_recall": 0.33929593134138586,
"spec_mcc": 0.2041103760829744,
"spec_auc": 0.8271022317290393,
"spec_ece": 0.4489923599362374,
"spec_confusion_matrix": [
[
130,
473,
2,
0
],
[
0,
145,
0,
0
],
[
6,
217,
37,
0
],
[
0,
160,
30,
0
]
],
"spec_f1_L1Generi": 0.3508771929824561,
"spec_prec_L1Generi": 0.9558823529411765,
"spec_recall_L1Generi": 0.21487603305785125,
"spec_f1_L2Domain": 0.2543859649122807,
"spec_prec_L2Domain": 0.1457286432160804,
"spec_recall_L2Domain": 1.0,
"spec_f1_L3Firm-S": 0.22492401215805471,
"spec_prec_L3Firm-S": 0.5362318840579711,
"spec_recall_L3Firm-S": 0.1423076923076923,
"spec_f1_L4Quanti": 0.0,
"spec_prec_L4Quanti": 0.0,
"spec_recall_L4Quanti": 0.0,
"spec_qwk": 0.24096533359991634,
"spec_mae": 0.88,
"spec_kripp_alpha": 0.2758412395136435
},
"_runtime": {
"encoder_mb": 274.843904,
"ms_per_sample": 5.861402786540566,
"throughput_per_s": 170.607623536175,
"peak_vram_mb": 1287.34326171875,
"build_s": 0.4908116469741799
}
}