""" Comprehensive comparison of v3.0 vs v3.5f prompt on the 359 confusion-axis holdout paragraphs. Covers per-model accuracy, per-axis breakdown, SI/NO asymmetry, rankings, convergence, and cost. """ import json from collections import Counter from pathlib import Path from itertools import combinations import numpy as np ROOT = Path("/home/joey/Documents/sec-cyBERT") # --------------------------------------------------------------------------- # Model definitions # --------------------------------------------------------------------------- MODELS = [ ("Opus", "golden", "opus"), ("GPT-5.4", "bench-holdout", "gpt-5.4"), ("Gemini-3.1-Pro", "bench-holdout", "gemini-3.1-pro-preview"), ("GLM-5", "bench-holdout", "glm-5:exacto"), ("Kimi-K2.5", "bench-holdout", "kimi-k2.5"), ("MIMO-v2-Pro", "bench-holdout", "mimo-v2-pro:exacto"), ("MiniMax-M2.7", "bench-holdout", "minimax-m2.7:exacto"), ] CATEGORY_ABBREV = { "None/Other": "N/O", "Background": "BG", "Risk Management Process": "RMP", "Management Role": "MR", "Strategy Integration": "SI", } def abbrev(cat: str) -> str: return CATEGORY_ABBREV.get(cat, cat) # --------------------------------------------------------------------------- # Data loading # --------------------------------------------------------------------------- def load_jsonl(path: Path) -> list[dict]: rows = [] with open(path) as f: for line in f: line = line.strip() if line: rows.append(json.loads(line)) return rows def load_model_labels(version_suffix: str, subdir: str, filename: str) -> dict[str, str]: """Return {paragraphId: content_category} for a model file.""" if version_suffix: base = ROOT / "data" / "annotations" / f"{subdir}-{version_suffix}" / f"{filename}.jsonl" else: base = ROOT / "data" / "annotations" / subdir / f"{filename}.jsonl" rows = load_jsonl(base) return {r["paragraphId"]: r["label"]["content_category"] for r in rows} def load_model_rows(version_suffix: str, subdir: str, filename: str) -> list[dict]: if version_suffix: base = ROOT / "data" / "annotations" / f"{subdir}-{version_suffix}" / f"{filename}.jsonl" else: base = ROOT / "data" / "annotations" / subdir / f"{filename}.jsonl" return load_jsonl(base) # Load holdout PIDs and axes holdout_rows = load_jsonl(ROOT / "data" / "gold" / "holdout-rerun-v35.jsonl") HOLDOUT_PIDS = {r["paragraphId"] for r in holdout_rows} PID_AXES: dict[str, list[str]] = {r["paragraphId"]: r["axes"] for r in holdout_rows} # Human labels → majority vote per PID human_raw = load_jsonl(ROOT / "data" / "gold" / "human-labels-raw.jsonl") human_by_pid: dict[str, list[str]] = {} for row in human_raw: pid = row["paragraphId"] if pid in HOLDOUT_PIDS: human_by_pid.setdefault(pid, []).append(row["contentCategory"]) human_majority: dict[str, str] = {} for pid, cats in human_by_pid.items(): counter = Counter(cats) human_majority[pid] = counter.most_common(1)[0][0] # Load v3.0 and v3.5f labels for all models v30_labels: dict[str, dict[str, str]] = {} # model_name -> {pid: cat} v35_labels: dict[str, dict[str, str]] = {} v35_rows_by_model: dict[str, list[dict]] = {} for name, subdir, filename in MODELS: # v3.0: full 1200 file, filter to 359 all_v30 = load_model_labels("", subdir, filename) v30_labels[name] = {pid: cat for pid, cat in all_v30.items() if pid in HOLDOUT_PIDS} # v3.5f suffix = "v35" sub = f"golden" if subdir == "golden" else "bench-holdout" v35_all = load_model_labels(suffix, sub, filename) v35_labels[name] = {pid: cat for pid, cat in v35_all.items() if pid in HOLDOUT_PIDS} v35_rows_by_model[name] = load_model_rows(suffix, sub, filename) # Common PID set (intersection of all models in both versions + human majority) common_pids = set(HOLDOUT_PIDS) for name in [m[0] for m in MODELS]: common_pids &= set(v30_labels[name].keys()) common_pids &= set(v35_labels[name].keys()) common_pids &= set(human_majority.keys()) common_pids_sorted = sorted(common_pids) N = len(common_pids_sorted) print(f"Common paragraphs across all models + human majority: {N}") print() # --------------------------------------------------------------------------- # Helper: 6-model majority (excl MiniMax) # --------------------------------------------------------------------------- TOP6_NAMES = [m[0] for m in MODELS if m[0] != "MiniMax-M2.7"] def majority_vote(labels_dict: dict[str, dict[str, str]], model_names: list[str], pid: str) -> str | None: cats = [] for mn in model_names: if pid in labels_dict[mn]: cats.append(labels_dict[mn][pid]) if not cats: return None counter = Counter(cats) return counter.most_common(1)[0][0] # =========================================================================== # 1. Per-model summary table # =========================================================================== print("=" * 90) print("1. PER-MODEL SUMMARY TABLE (vs human majority)") print("=" * 90) header = f"{'Model':<20} {'v3.0 Acc':>10} {'v3.5f Acc':>10} {'Delta':>8} {'Change%':>9}" print(header) print("-" * len(header)) model_v30_acc = {} model_v35_acc = {} for name, _, _ in MODELS: correct_30 = sum(1 for pid in common_pids_sorted if v30_labels[name][pid] == human_majority[pid]) correct_35 = sum(1 for pid in common_pids_sorted if v35_labels[name][pid] == human_majority[pid]) changed = sum(1 for pid in common_pids_sorted if v30_labels[name][pid] != v35_labels[name][pid]) acc30 = correct_30 / N acc35 = correct_35 / N delta = acc35 - acc30 change_rate = changed / N model_v30_acc[name] = acc30 model_v35_acc[name] = acc35 print(f"{name:<20} {acc30:>9.1%} {acc35:>9.1%} {delta:>+7.1%} {change_rate:>8.1%}") # 6-model majority row correct_30_maj = 0 correct_35_maj = 0 changed_maj = 0 for pid in common_pids_sorted: m30 = majority_vote(v30_labels, TOP6_NAMES, pid) m35 = majority_vote(v35_labels, TOP6_NAMES, pid) if m30 == human_majority[pid]: correct_30_maj += 1 if m35 == human_majority[pid]: correct_35_maj += 1 if m30 != m35: changed_maj += 1 acc30_maj = correct_30_maj / N acc35_maj = correct_35_maj / N delta_maj = acc35_maj - acc30_maj change_maj_rate = changed_maj / N model_v30_acc["6-model majority"] = acc30_maj model_v35_acc["6-model majority"] = acc35_maj print("-" * len(header)) print(f"{'6-model maj (no MM)':<20} {acc30_maj:>9.1%} {acc35_maj:>9.1%} {delta_maj:>+7.1%} {change_maj_rate:>8.1%}") print() # =========================================================================== # 2. Per-axis breakdown (6-model majority excl MiniMax) # =========================================================================== print("=" * 90) print("2. PER-AXIS BREAKDOWN (6-model majority excl MiniMax vs human majority)") print("=" * 90) all_axes = sorted({ax for axes in PID_AXES.values() for ax in axes}) header2 = f"{'Axis':<12} {'N':>5} {'v3.0 Acc':>10} {'v3.5f Acc':>10} {'Delta':>8}" print(header2) print("-" * len(header2)) for axis in all_axes: axis_pids = [pid for pid in common_pids_sorted if axis in PID_AXES.get(pid, [])] n_axis = len(axis_pids) if n_axis == 0: continue correct_30 = sum(1 for pid in axis_pids if majority_vote(v30_labels, TOP6_NAMES, pid) == human_majority[pid]) correct_35 = sum(1 for pid in axis_pids if majority_vote(v35_labels, TOP6_NAMES, pid) == human_majority[pid]) a30 = correct_30 / n_axis a35 = correct_35 / n_axis d = a35 - a30 print(f"{axis:<12} {n_axis:>5} {a30:>9.1%} {a35:>9.1%} {d:>+7.1%}") print() # =========================================================================== # 3. SI ↔ N/O asymmetry check # =========================================================================== print("=" * 90) print("3. SI <-> N/O ASYMMETRY CHECK") print("=" * 90) si_no_pids = [pid for pid in common_pids_sorted if "SI_NO" in PID_AXES.get(pid, [])] print(f"SI↔N/O paragraphs in common set: {len(si_no_pids)}") print() for version_label, labels_dict in [("v3.0", v30_labels), ("v3.5f", v35_labels)]: human_si_model_no = 0 human_no_model_si = 0 for pid in si_no_pids: h = human_majority[pid] m = majority_vote(labels_dict, TOP6_NAMES, pid) if h == "Strategy Integration" and m == "None/Other": human_si_model_no += 1 elif h == "None/Other" and m == "Strategy Integration": human_no_model_si += 1 print(f"{version_label}:") print(f" Human=SI, 6-model=N/O: {human_si_model_no}") print(f" Human=N/O, 6-model=SI: {human_no_model_si}") print() # Also show per-model breakdown for SI↔N/O print("Per-model SI↔N/O errors:") header3 = f"{'Model':<20} {'v3.0 H=SI,M=NO':>16} {'v3.0 H=NO,M=SI':>16} {'v3.5 H=SI,M=NO':>16} {'v3.5 H=NO,M=SI':>16}" print(header3) print("-" * len(header3)) for name, _, _ in MODELS: counts = [] for labels_dict in [v30_labels, v35_labels]: hsi_mno = 0 hno_msi = 0 for pid in si_no_pids: h = human_majority[pid] m = labels_dict[name].get(pid) if m is None: continue if h == "Strategy Integration" and m == "None/Other": hsi_mno += 1 elif h == "None/Other" and m == "Strategy Integration": hno_msi += 1 counts.extend([hsi_mno, hno_msi]) print(f"{name:<20} {counts[0]:>16} {counts[1]:>16} {counts[2]:>16} {counts[3]:>16}") print() # =========================================================================== # 4. Per-model ranking # =========================================================================== print("=" * 90) print("4. PER-MODEL RANKING") print("=" * 90) all_names = [m[0] for m in MODELS] rank_v30 = sorted(all_names, key=lambda n: model_v30_acc[n], reverse=True) rank_v35 = sorted(all_names, key=lambda n: model_v35_acc[n], reverse=True) header4 = f"{'Rank':>4} {'v3.0 Model':<20} {'Acc':>8} {'v3.5f Model':<20} {'Acc':>8}" print(header4) print("-" * len(header4)) for i in range(len(all_names)): n30 = rank_v30[i] n35 = rank_v35[i] print(f"{i+1:>4} {n30:<20} {model_v30_acc[n30]:>7.1%} {n35:<20} {model_v35_acc[n35]:>7.1%}") print() # =========================================================================== # 5. Model convergence (average pairwise agreement) # =========================================================================== print("=" * 90) print("5. MODEL CONVERGENCE (average pairwise agreement)") print("=" * 90) def avg_pairwise_agreement(labels_dict: dict[str, dict[str, str]], model_names: list[str], pids: list[str]) -> float: agreements = [] for m1, m2 in combinations(model_names, 2): agree = sum(1 for pid in pids if labels_dict[m1].get(pid) == labels_dict[m2].get(pid)) agreements.append(agree / len(pids)) return float(np.mean(agreements)) for group_label, group_names in [("All 7 models", all_names), ("Top 6 (excl MiniMax)", TOP6_NAMES)]: a30 = avg_pairwise_agreement(v30_labels, group_names, common_pids_sorted) a35 = avg_pairwise_agreement(v35_labels, group_names, common_pids_sorted) delta = a35 - a30 print(f"{group_label}:") print(f" v3.0 avg pairwise agreement: {a30:.1%}") print(f" v3.5f avg pairwise agreement: {a35:.1%}") print(f" Delta: {delta:+.1%}") print() # =========================================================================== # 6. Cost summary # =========================================================================== print("=" * 90) print("6. v3.5f RE-RUN COST SUMMARY") print("=" * 90) total_cost = 0.0 header6 = f"{'Model':<20} {'Records':>8} {'Cost ($)':>10}" print(header6) print("-" * len(header6)) for name, _, _ in MODELS: rows = v35_rows_by_model[name] cost = sum(r.get("provenance", {}).get("costUsd", 0) for r in rows) total_cost += cost print(f"{name:<20} {len(rows):>8} {cost:>10.4f}") print("-" * len(header6)) print(f"{'TOTAL':<20} {'':<8} {total_cost:>10.4f}") print()