{ "best-base_weighted_ce-ep5_vs_GPT-5.4": { "cat_macro_f1": 0.9360575579144376, "cat_weighted_f1": 0.9360564701876355, "cat_macro_precision": 0.9336791798534633, "cat_macro_recall": 0.9414080218768329, "cat_mcc": 0.9248088496355107, "cat_auc": 0.991343460842945, "cat_ece": 0.0441274690628052, "cat_confusion_matrix": [ [ 220, 0, 8, 0, 1, 0, 1 ], [ 0, 86, 0, 0, 1, 1, 0 ], [ 1, 0, 143, 1, 5, 0, 0 ], [ 0, 0, 2, 133, 0, 1, 0 ], [ 6, 0, 7, 18, 165, 1, 1 ], [ 0, 3, 1, 8, 2, 207, 0 ], [ 0, 0, 0, 1, 6, 1, 169 ] ], "cat_f1_BoardGov": 0.962800875273523, "cat_prec_BoardGov": 0.9691629955947136, "cat_recall_BoardGov": 0.9565217391304348, "cat_f1_Incident": 0.9717514124293786, "cat_prec_Incident": 0.9662921348314607, "cat_recall_Incident": 0.9772727272727273, "cat_f1_Manageme": 0.9196141479099679, "cat_prec_Manageme": 0.8881987577639752, "cat_recall_Manageme": 0.9533333333333334, "cat_f1_NoneOthe": 0.8956228956228957, "cat_prec_NoneOthe": 0.8260869565217391, "cat_recall_NoneOthe": 0.9779411764705882, "cat_f1_RiskMana": 0.873015873015873, "cat_prec_RiskMana": 0.9166666666666666, "cat_recall_RiskMana": 0.8333333333333334, "cat_f1_Strategy": 0.9583333333333334, "cat_prec_Strategy": 0.981042654028436, "cat_recall_Strategy": 0.9366515837104072, "cat_f1_Third-Pa": 0.9712643678160919, "cat_prec_Third-Pa": 0.9883040935672515, "cat_recall_Third-Pa": 0.9548022598870056, "cat_kripp_alpha": 0.9243601922903683, "spec_macro_f1": 0.5970357338282843, "spec_weighted_f1": 0.7040798408451929, "spec_macro_precision": 0.7225196233593912, "spec_macro_recall": 0.6139005306639329, "spec_mcc": 0.6138700055328291, "spec_auc": 0.9498756282617218, "spec_ece": 0.1652249880135059, "spec_confusion_matrix": [ [ 596, 6, 11, 5 ], [ 105, 46, 9, 8 ], [ 14, 6, 52, 135 ], [ 4, 0, 3, 200 ] ], "spec_f1_L1Generi": 0.8915482423335827, "spec_prec_L1Generi": 0.8289290681502086, "spec_recall_L1Generi": 0.9644012944983819, "spec_f1_L2Domain": 0.40707964601769914, "spec_prec_L2Domain": 0.7931034482758621, "spec_recall_L2Domain": 0.27380952380952384, "spec_f1_L3Firm-S": 0.36879432624113473, "spec_prec_L3Firm-S": 0.6933333333333334, "spec_recall_L3Firm-S": 0.25120772946859904, "spec_f1_L4Quanti": 0.7207207207207207, "spec_prec_L4Quanti": 0.5747126436781609, "spec_recall_L4Quanti": 0.966183574879227, "spec_qwk": 0.8757404773441285, "spec_mae": 0.2975, "spec_kripp_alpha": 0.8479072400833478, "total_time_s": 6.695346015971154, "num_samples": 1200, "avg_ms_per_sample": 5.579455013309295, "combined_macro_f1": 0.766546645871361 }, "best-base_weighted_ce-ep5_vs_Opus-4.6": { "cat_macro_f1": 0.9280167387549427, "cat_weighted_f1": 0.9273898648954128, "cat_macro_precision": 0.9223465490796974, "cat_macro_recall": 0.9382296607170699, "cat_mcc": 0.9162751746063641, "cat_auc": 0.992382433433919, "cat_ece": 0.04689237485329312, "cat_confusion_matrix": [ [ 209, 0, 2, 1, 1, 0, 1 ], [ 0, 78, 0, 0, 1, 0, 0 ], [ 4, 0, 147, 2, 4, 0, 1 ], [ 0, 0, 1, 139, 1, 0, 0 ], [ 13, 1, 10, 15, 168, 1, 5 ], [ 1, 10, 1, 4, 3, 209, 0 ], [ 0, 0, 0, 0, 2, 1, 164 ] ], "cat_f1_BoardGov": 0.9478458049886621, "cat_prec_BoardGov": 0.920704845814978, "cat_recall_BoardGov": 0.9766355140186916, "cat_f1_Incident": 0.9285714285714286, "cat_prec_Incident": 0.8764044943820225, "cat_recall_Incident": 0.9873417721518988, "cat_f1_Manageme": 0.9216300940438872, "cat_prec_Manageme": 0.9130434782608695, "cat_recall_Manageme": 0.930379746835443, "cat_f1_NoneOthe": 0.9205298013245033, "cat_prec_NoneOthe": 0.8633540372670807, "cat_recall_NoneOthe": 0.9858156028368794, "cat_f1_RiskMana": 0.8549618320610687, "cat_prec_RiskMana": 0.9333333333333333, "cat_recall_RiskMana": 0.7887323943661971, "cat_f1_Strategy": 0.9521640091116174, "cat_prec_Strategy": 0.990521327014218, "cat_recall_Strategy": 0.9166666666666666, "cat_f1_Third-Pa": 0.9704142011834319, "cat_prec_Third-Pa": 0.9590643274853801, "cat_recall_Third-Pa": 0.9820359281437125, "cat_kripp_alpha": 0.9154955768233572, "spec_macro_f1": 0.5957642708821952, "spec_weighted_f1": 0.693005282664721, "spec_macro_precision": 0.731933400476396, "spec_macro_recall": 0.6249872364065566, "spec_mcc": 0.6143201053040909, "spec_auc": 0.9470735892830423, "spec_ece": 0.18189165468017254, "spec_confusion_matrix": [ [ 592, 4, 4, 5 ], [ 92, 42, 6, 5 ], [ 35, 12, 63, 150 ], [ 0, 0, 2, 188 ] ], "spec_f1_L1Generi": 0.8942598187311178, "spec_prec_L1Generi": 0.8233657858136301, "spec_recall_L1Generi": 0.9785123966942149, "spec_f1_L2Domain": 0.41379310344827586, "spec_prec_L2Domain": 0.7241379310344828, "spec_recall_L2Domain": 0.2896551724137931, "spec_f1_L3Firm-S": 0.3761194029850746, "spec_prec_L3Firm-S": 0.84, "spec_recall_L3Firm-S": 0.2423076923076923, "spec_f1_L4Quanti": 0.6988847583643123, "spec_prec_L4Quanti": 0.5402298850574713, "spec_recall_L4Quanti": 0.9894736842105263, "spec_qwk": 0.872110225054491, "spec_mae": 0.3075, "spec_kripp_alpha": 0.8502616991488389, "total_time_s": 6.695346015971154, "num_samples": 1200, "avg_ms_per_sample": 5.579455013309295, "combined_macro_f1": 0.7618905048185689 } }