{ "ensemble-3seed_vs_GPT-5.4": { "cat_macro_f1": 0.9382530391727061, "cat_weighted_f1": 0.9385858996685268, "cat_macro_precision": 0.937038491784886, "cat_macro_recall": 0.9417984783962936, "cat_mcc": 0.9275970467019695, "cat_auc": 0.9930606345789074, "cat_ece": 0.05087702547510463, "cat_confusion_matrix": [ [ 225, 0, 3, 0, 2, 0, 0 ], [ 0, 85, 0, 0, 2, 1, 0 ], [ 2, 0, 145, 1, 2, 0, 0 ], [ 0, 0, 3, 132, 0, 1, 0 ], [ 6, 1, 4, 18, 167, 1, 1 ], [ 0, 2, 1, 8, 2, 208, 0 ], [ 0, 0, 0, 0, 13, 0, 164 ] ], "cat_f1_BoardGov": 0.9719222462203023, "cat_prec_BoardGov": 0.9656652360515021, "cat_recall_BoardGov": 0.9782608695652174, "cat_f1_Incident": 0.9659090909090909, "cat_prec_Incident": 0.9659090909090909, "cat_recall_Incident": 0.9659090909090909, "cat_f1_Manageme": 0.9477124183006536, "cat_prec_Manageme": 0.9294871794871795, "cat_recall_Manageme": 0.9666666666666667, "cat_f1_NoneOthe": 0.8949152542372881, "cat_prec_NoneOthe": 0.8301886792452831, "cat_recall_NoneOthe": 0.9705882352941176, "cat_f1_RiskMana": 0.8652849740932642, "cat_prec_RiskMana": 0.8882978723404256, "cat_recall_RiskMana": 0.8434343434343434, "cat_f1_Strategy": 0.9629629629629629, "cat_prec_Strategy": 0.985781990521327, "cat_recall_Strategy": 0.9411764705882353, "cat_f1_Third-Pa": 0.9590643274853801, "cat_prec_Third-Pa": 0.9939393939393939, "cat_recall_Third-Pa": 0.9265536723163842, "cat_kripp_alpha": 0.9272644584249223, "spec_macro_f1": 0.902152688639083, "spec_weighted_f1": 0.9177972939099285, "spec_macro_precision": 0.9070378979232232, "spec_macro_recall": 0.8991005681856252, "spec_mcc": 0.8753613597836426, "spec_auc": 0.9826044267990239, "spec_ece": 0.06921947295467064, "spec_confusion_matrix": [ [ 583, 17, 15, 3 ], [ 28, 130, 9, 1 ], [ 10, 3, 192, 2 ], [ 2, 1, 7, 197 ] ], "spec_f1_L1Generi": 0.9395648670427075, "spec_prec_L1Generi": 0.9357945425361156, "spec_recall_L1Generi": 0.9433656957928802, "spec_f1_L2Domain": 0.8150470219435737, "spec_prec_L2Domain": 0.8609271523178808, "spec_recall_L2Domain": 0.7738095238095238, "spec_f1_L3Firm-S": 0.8930232558139535, "spec_prec_L3Firm-S": 0.8609865470852018, "spec_recall_L3Firm-S": 0.927536231884058, "spec_f1_L4Quanti": 0.9609756097560975, "spec_prec_L4Quanti": 0.9704433497536946, "spec_recall_L4Quanti": 0.9516908212560387, "spec_qwk": 0.9338562415243872, "spec_mae": 0.1125, "spec_kripp_alpha": 0.9206308343112934, "total_time_s": 19.849480003875215, "num_samples": 1200, "avg_ms_per_sample": 16.54123333656268, "combined_macro_f1": 0.9202028639058946 }, "ensemble-3seed_vs_Opus-4.6": { "cat_macro_f1": 0.9287535853888995, "cat_weighted_f1": 0.9277067129478959, "cat_macro_precision": 0.9242877868683518, "cat_macro_recall": 0.9368327500295983, "cat_mcc": 0.9160728021840298, "cat_auc": 0.9947981532709612, "cat_ece": 0.06293055539329852, "cat_confusion_matrix": [ [ 211, 0, 1, 1, 1, 0, 0 ], [ 0, 78, 0, 0, 1, 0, 0 ], [ 8, 0, 145, 1, 3, 0, 1 ], [ 0, 0, 1, 139, 1, 0, 0 ], [ 13, 0, 8, 13, 173, 1, 5 ], [ 1, 10, 1, 4, 3, 209, 0 ], [ 0, 0, 0, 1, 6, 1, 159 ] ], "cat_f1_BoardGov": 0.9440715883668904, "cat_prec_BoardGov": 0.9055793991416309, "cat_recall_BoardGov": 0.985981308411215, "cat_f1_Incident": 0.9341317365269461, "cat_prec_Incident": 0.8863636363636364, "cat_recall_Incident": 0.9873417721518988, "cat_f1_Manageme": 0.9235668789808917, "cat_prec_Manageme": 0.9294871794871795, "cat_recall_Manageme": 0.9177215189873418, "cat_f1_NoneOthe": 0.9266666666666666, "cat_prec_NoneOthe": 0.8742138364779874, "cat_recall_NoneOthe": 0.9858156028368794, "cat_f1_RiskMana": 0.8628428927680798, "cat_prec_RiskMana": 0.9202127659574468, "cat_recall_RiskMana": 0.812206572769953, "cat_f1_Strategy": 0.9521640091116174, "cat_prec_Strategy": 0.990521327014218, "cat_recall_Strategy": 0.9166666666666666, "cat_f1_Third-Pa": 0.9578313253012049, "cat_prec_Third-Pa": 0.9636363636363636, "cat_recall_Third-Pa": 0.9520958083832335, "cat_kripp_alpha": 0.9154443888884335, "spec_macro_f1": 0.8852876459236954, "spec_weighted_f1": 0.9023972621736004, "spec_macro_precision": 0.888087338599951, "spec_macro_recall": 0.8858055716763026, "spec_mcc": 0.8535145242291756, "spec_auc": 0.9775733710374438, "spec_ece": 0.08450941021243728, "spec_confusion_matrix": [ [ 571, 24, 9, 1 ], [ 21, 118, 5, 1 ], [ 31, 9, 207, 13 ], [ 0, 0, 2, 188 ] ], "spec_f1_L1Generi": 0.9299674267100977, "spec_prec_L1Generi": 0.9165329052969502, "spec_recall_L1Generi": 0.943801652892562, "spec_f1_L2Domain": 0.7972972972972973, "spec_prec_L2Domain": 0.7814569536423841, "spec_recall_L2Domain": 0.8137931034482758, "spec_f1_L3Firm-S": 0.8571428571428571, "spec_prec_L3Firm-S": 0.9282511210762332, "spec_recall_L3Firm-S": 0.7961538461538461, "spec_f1_L4Quanti": 0.9567430025445293, "spec_prec_L4Quanti": 0.9261083743842364, "spec_recall_L4Quanti": 0.9894736842105263, "spec_qwk": 0.9247559136673115, "spec_mae": 0.1325, "spec_kripp_alpha": 0.910971486983108, "total_time_s": 19.849480003875215, "num_samples": 1200, "avg_ms_per_sample": 16.54123333656268, "combined_macro_f1": 0.9070206156562974 } }