"""Compare v3.0 vs v3.5 annotations on 359 confusion-axis holdout paragraphs.""" import json from collections import Counter, defaultdict from pathlib import Path import numpy as np # ── Paths ────────────────────────────────────────────────────────────────────── ROOT = Path(__file__).resolve().parent.parent V30_GOLDEN = ROOT / "data/annotations/golden/opus.jsonl" V35_GOLDEN = ROOT / "data/annotations/golden-v35/opus.jsonl" V30_BENCH = ROOT / "data/annotations/bench-holdout" V35_BENCH = ROOT / "data/annotations/bench-holdout-v35" HUMAN_LABELS = ROOT / "data/gold/human-labels-raw.jsonl" HOLDOUT_META = ROOT / "data/gold/holdout-rerun-v35.jsonl" MODEL_FILES = [ "opus.jsonl", # golden dirs "gpt-5.4.jsonl", "gemini-3.1-pro-preview.jsonl", "glm-5:exacto.jsonl", "kimi-k2.5.jsonl", "mimo-v2-pro:exacto.jsonl", "minimax-m2.7:exacto.jsonl", ] MODEL_NAMES = [ "Opus", "GPT-5.4", "Gemini-3.1-Pro", "GLM-5", "Kimi-K2.5", "Mimo-v2-Pro", "MiniMax-M2.7", ] # Category abbreviations used in axes CAT_ABBREV = { "BG": "Board Governance", "MR": "Management Role", "RMP": "Risk Management Process", "SI": "Strategy Integration", "NO": "None/Other", "ID": "Incident Disclosure", "TPR": "Third-Party Risk", } ABBREV_CAT = {v: k for k, v in CAT_ABBREV.items()} def abbrev(cat: str) -> str: return ABBREV_CAT.get(cat, cat) def full_cat(ab: str) -> str: return CAT_ABBREV.get(ab, ab) # ── Load data ────────────────────────────────────────────────────────────────── def load_jsonl(path: Path) -> list[dict]: with open(path) as f: return [json.loads(line) for line in f if line.strip()] def load_annotations(base_dir: Path, filename: str) -> dict[str, str]: """Load paragraphId → content_category mapping.""" path = base_dir / filename records = load_jsonl(path) return {r["paragraphId"]: r["label"]["content_category"] for r in records} def load_golden(path: Path) -> dict[str, str]: records = load_jsonl(path) return {r["paragraphId"]: r["label"]["content_category"] for r in records} # Load holdout metadata 
holdout_records = load_jsonl(HOLDOUT_META)
holdout_pids = {r["paragraphId"] for r in holdout_records}
# paragraphId → list of confusion axes it belongs to (e.g. ["SI_NO"]).
pid_axes = {r["paragraphId"]: r["axes"] for r in holdout_records}
# paragraphId → materiality flag; loaded for interactive use, not printed below.
pid_materiality = {r["paragraphId"]: r.get("hasMaterialityLanguage", False) for r in holdout_records}
assert len(holdout_pids) == 359, f"Expected 359 holdout PIDs, got {len(holdout_pids)}"

# Load v3.0 annotations per model (filtered to 359 holdout PIDs)
v30: dict[str, dict[str, str]] = {}  # model_name → {pid → category}
v35: dict[str, dict[str, str]] = {}
for fname, mname in zip(MODEL_FILES, MODEL_NAMES):
    # Opus lives in the "golden" dirs; every other model in the bench dirs.
    if fname == "opus.jsonl":
        v30_all = load_golden(V30_GOLDEN)
        v35[mname] = load_golden(V35_GOLDEN)
    else:
        v30_all = load_annotations(V30_BENCH, fname)
        v35[mname] = load_annotations(V35_BENCH, fname)
    # v3.0 files cover the full corpus, so restrict them to the holdout PIDs;
    # the v3.5 files were produced on the holdout only and are kept as-is.
    v30[mname] = {pid: v30_all[pid] for pid in holdout_pids if pid in v30_all}

# Load human labels
human_raw = load_jsonl(HUMAN_LABELS)
# Group by paragraphId, compute majority
human_labels_by_pid: dict[str, list[str]] = defaultdict(list)
for rec in human_raw:
    human_labels_by_pid[rec["paragraphId"]].append(rec["contentCategory"])
human_majority: dict[str, str] = {}
for pid, labels in human_labels_by_pid.items():
    # Ties are broken by first-seen order (Counter.most_common is insertion-stable).
    human_majority[pid] = Counter(labels).most_common(1)[0][0]

# Axes grouping
axis_pids: dict[str, set[str]] = defaultdict(set)
for pid, axes in pid_axes.items():
    for ax in axes:
        axis_pids[ax].add(pid)

AXIS_LABELS = {
    "SI_NO": "SI↔N/O",
    "MR_RMP": "MR↔RMP",
    "BG_MR": "BG↔MR",
    "BG_RMP": "BG↔RMP",
}


# ── Helpers ────────────────────────────────────────────────────────────────────
def majority_vote(model_cats: dict[str, dict[str, str]], pid: str) -> str | None:
    """Get majority category across all models for a PID.

    Returns None when no model annotated the paragraph. Ties are broken by
    first-seen order (Counter.most_common is insertion-stable).
    """
    votes = [model_cats[m].get(pid) for m in MODEL_NAMES if pid in model_cats[m]]
    votes = [v for v in votes if v is not None]
    if not votes:
        return None
    return Counter(votes).most_common(1)[0][0]


def agreement_rate(model_cats: dict[str, dict[str, str]], pids: set[str]) -> float:
    """Average pairwise agreement among 7 models on given PIDs.

    Each unordered pair of models that both annotated a PID contributes one
    observation; returns the fraction of agreeing pairs (0.0 when no pairs).
    """
    total_pairs = 0
    agree_pairs = 0
    for pid in pids:
        cats = [model_cats[m].get(pid) for m in MODEL_NAMES if pid in model_cats[m]]
        cats = [c for c in cats if c is not None]
        n = len(cats)
        for i in range(n):
            for j in range(i + 1, n):
                total_pairs += 1
                if cats[i] == cats[j]:
                    agree_pairs += 1
    return agree_pairs / total_pairs if total_pairs > 0 else 0.0


def pairwise_agreement_matrix(model_cats: dict[str, dict[str, str]], pids: set[str]) -> np.ndarray:
    """Return 7x7 pairwise agreement matrix.

    Entry [i, j] is the fraction of PIDs (annotated by both models i and j)
    on which they agree; the diagonal is 1.0 and missing overlap yields 0.0.
    """
    n = len(MODEL_NAMES)
    mat = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            if i == j:
                mat[i, j] = 1.0
                continue
            agree = 0
            total = 0
            for pid in pids:
                ci = model_cats[MODEL_NAMES[i]].get(pid)
                cj = model_cats[MODEL_NAMES[j]].get(pid)
                if ci is not None and cj is not None:
                    total += 1
                    if ci == cj:
                        agree += 1
            mat[i, j] = agree / total if total > 0 else 0.0
    return mat


# ── Section 1: Per-model category change rate ─────────────────────────────────
print("=" * 80)
print("1. PER-MODEL CATEGORY CHANGE RATE (v3.0 → v3.5)")
print("=" * 80)
print()
header = f"{'Model':<18} {'Changed':>8} {'Total':>6} {'% Changed':>10}"
print(header)
print("-" * len(header))
for mname in MODEL_NAMES:
    changed = 0
    total = 0
    for pid in holdout_pids:
        c30 = v30[mname].get(pid)
        c35 = v35[mname].get(pid)
        # Only PIDs annotated under both prompt versions count toward the rate.
        if c30 is not None and c35 is not None:
            total += 1
            if c30 != c35:
                changed += 1
    pct = (changed / total * 100) if total > 0 else 0
    print(f"{mname:<18} {changed:>8} {total:>6} {pct:>9.1f}%")
print()

# Top transitions per model
print("Top category transitions per model:")
print()
for mname in MODEL_NAMES:
    transitions: Counter = Counter()
    for pid in holdout_pids:
        c30 = v30[mname].get(pid)
        c35 = v35[mname].get(pid)
        if c30 is not None and c35 is not None and c30 != c35:
            transitions[(abbrev(c30), abbrev(c35))] += 1
    if transitions:
        top = transitions.most_common(5)
        parts = [f"{a}→{b} ({n})" for (a, b), n in top]
        print(f"  {mname:<18} {', '.join(parts)}")
print()

# ── Section 2: Per-axis resolution analysis ───────────────────────────────────
print("=" * 80)
print("2. PER-AXIS RESOLUTION ANALYSIS")
print("=" * 80)
print()
for axis_key, axis_label in AXIS_LABELS.items():
    pids_on_axis = axis_pids[axis_key]
    print(f"--- {axis_label} ({len(pids_on_axis)} paragraphs) ---")
    print()
    # v3.0 and v3.5 majorities
    v30_maj = {pid: majority_vote(v30, pid) for pid in pids_on_axis}
    v35_maj = {pid: majority_vote(v35, pid) for pid in pids_on_axis}
    # Majority distribution
    v30_dist = Counter(v for v in v30_maj.values() if v)
    v35_dist = Counter(v for v in v35_maj.values() if v)
    print("  v3.0 majority distribution: ", end="")
    print(", ".join(f"{abbrev(k)}={v}" for k, v in v30_dist.most_common()))
    print("  v3.5 majority distribution: ", end="")
    print(", ".join(f"{abbrev(k)}={v}" for k, v in v35_dist.most_common()))
    # Flipped majority
    flipped = sum(
        1
        for pid in pids_on_axis
        if v30_maj.get(pid) and v35_maj.get(pid) and v30_maj[pid] != v35_maj[pid]
    )
    print(f"  Paragraphs with flipped majority: {flipped}/{len(pids_on_axis)} ({flipped / len(pids_on_axis) * 100:.1f}%)")
    # New agreement rate (7-model)
    v30_agree = agreement_rate(v30, pids_on_axis)
    v35_agree = agreement_rate(v35, pids_on_axis)
    print(f"  7-model avg pairwise agreement: v3.0={v30_agree:.3f} → v3.5={v35_agree:.3f} (Δ={v35_agree - v30_agree:+.3f})")
    print()

# ── Section 3: Human alignment improvement ───────────────────────────────────
print("=" * 80)
print("3. HUMAN ALIGNMENT IMPROVEMENT")
print("=" * 80)
print()
# Overall
pids_with_human = holdout_pids & set(human_majority.keys())
v30_agree_human = 0
v35_agree_human = 0
total_human = 0
for pid in pids_with_human:
    hm = human_majority[pid]
    m30 = majority_vote(v30, pid)
    m35 = majority_vote(v35, pid)
    if m30 is not None and m35 is not None:
        total_human += 1
        if m30 == hm:
            v30_agree_human += 1
        if m35 == hm:
            v35_agree_human += 1
v30_pct = v30_agree_human / total_human * 100 if total_human else 0
v35_pct = v35_agree_human / total_human * 100 if total_human else 0
print(f"Overall (n={total_human}):")
print(f"  v3.0 GenAI majority vs human majority: {v30_agree_human}/{total_human} ({v30_pct:.1f}%)")
print(f"  v3.5 GenAI majority vs human majority: {v35_agree_human}/{total_human} ({v35_pct:.1f}%)")
print(f"  Delta: {v35_pct - v30_pct:+.1f}pp")
print()

# By axis
print("By axis:")
header = f"{'Axis':<12} {'n':>4} {'v3.0 %':>8} {'v3.5 %':>8} {'Delta':>8}"
print(header)
print("-" * len(header))
for axis_key, axis_label in AXIS_LABELS.items():
    pids_ax = axis_pids[axis_key] & pids_with_human
    a30 = 0
    a35 = 0
    tot = 0
    for pid in pids_ax:
        hm = human_majority[pid]
        m30 = majority_vote(v30, pid)
        m35 = majority_vote(v35, pid)
        if m30 is not None and m35 is not None:
            tot += 1
            if m30 == hm:
                a30 += 1
            if m35 == hm:
                a35 += 1
    p30 = a30 / tot * 100 if tot else 0
    p35 = a35 / tot * 100 if tot else 0
    print(f"{axis_label:<12} {tot:>4} {p30:>7.1f}% {p35:>7.1f}% {p35 - p30:>+7.1f}pp")
print()

# ── Section 4: SI↔N/O specific analysis ──────────────────────────────────────
print("=" * 80)
print("4. SI↔N/O SPECIFIC ANALYSIS")
print("=" * 80)
print()
si_no_pids = axis_pids["SI_NO"]
print(f"Paragraphs on SI↔N/O axis: {len(si_no_pids)}")
print()

# Per-model SI call rate
print("Per-model SI call rate:")
header = f"{'Model':<18} {'v3.0 SI':>8} {'v3.0 NO':>8} {'v3.5 SI':>8} {'v3.5 NO':>8} {'v3.0 SI%':>9} {'v3.5 SI%':>9}"
print(header)
print("-" * len(header))
for mname in MODEL_NAMES:
    si30 = sum(1 for pid in si_no_pids if v30[mname].get(pid) == "Strategy Integration")
    no30 = sum(1 for pid in si_no_pids if v30[mname].get(pid) == "None/Other")
    si35 = sum(1 for pid in si_no_pids if v35[mname].get(pid) == "Strategy Integration")
    no35 = sum(1 for pid in si_no_pids if v35[mname].get(pid) == "None/Other")
    # SI% is taken over all axis paragraphs, not just SI/NO calls.
    pct30 = si30 / len(si_no_pids) * 100
    pct35 = si35 / len(si_no_pids) * 100
    print(f"{mname:<18} {si30:>8} {no30:>8} {si35:>8} {no35:>8} {pct30:>8.1f}% {pct35:>8.1f}%")
print()

# N/O → SI switches per model
print("Models switching N/O → SI on SI↔N/O paragraphs:")
for mname in MODEL_NAMES:
    switches = sum(
        1
        for pid in si_no_pids
        if v30[mname].get(pid) == "None/Other" and v35[mname].get(pid) == "Strategy Integration"
    )
    reverse = sum(
        1
        for pid in si_no_pids
        if v30[mname].get(pid) == "Strategy Integration" and v35[mname].get(pid) == "None/Other"
    )
    print(f"  {mname:<18} N/O→SI: {switches:>3}, SI→N/O: {reverse:>3}")
print()

# Per-paragraph tally shift
print("Per-paragraph SI vs N/O tally (v3.0 → v3.5), showing shifts:")
print()
header = f"{'ParagraphId':<38} {'v3.0 SI':>7} {'v3.0 NO':>7} {'v3.5 SI':>7} {'v3.5 NO':>7} {'Human':>6} {'Resolved?':>10}"
print(header)
print("-" * len(header))
resolved_count = 0
total_si_no_with_human = 0
for pid in sorted(si_no_pids):
    si30 = sum(1 for m in MODEL_NAMES if v30[m].get(pid) == "Strategy Integration")
    no30 = sum(1 for m in MODEL_NAMES if v30[m].get(pid) == "None/Other")
    si35 = sum(1 for m in MODEL_NAMES if v35[m].get(pid) == "Strategy Integration")
    no35 = sum(1 for m in MODEL_NAMES if v35[m].get(pid) == "None/Other")
    hm = human_majority.get(pid, "?")
    hm_ab = abbrev(hm) if hm != "?" else "?"
    # "Resolved" = v3.5 majority matches human majority
    v35_maj = "SI" if si35 > no35 else ("NO" if no35 > si35 else "TIE")
    resolved = "YES" if hm_ab == v35_maj else ("" if hm == "?" else "no")
    if hm != "?":
        total_si_no_with_human += 1
        if hm_ab == v35_maj:
            resolved_count += 1
    print(f"{pid[:36]:<38} {si30:>7} {no30:>7} {si35:>7} {no35:>7} {hm_ab:>6} {resolved:>10}")
print()
print(f"SI↔N/O resolution rate (v3.5 majority matches human): {resolved_count}/{total_si_no_with_human} ({resolved_count / total_si_no_with_human * 100:.1f}%)" if total_si_no_with_human else "No human labels for SI↔N/O paragraphs")

# 23:0 asymmetry check
print()
print("23:0 asymmetry check:")
# In v3.0, how many SI↔N/O paragraphs had human=SI but GenAI majority=N/O?
asym_30 = sum(
    1
    for pid in si_no_pids
    if human_majority.get(pid) == "Strategy Integration" and majority_vote(v30, pid) == "None/Other"
)
asym_35 = sum(
    1
    for pid in si_no_pids
    if human_majority.get(pid) == "Strategy Integration" and majority_vote(v35, pid) == "None/Other"
)
print(f"  v3.0: Human=SI but GenAI majority=N/O: {asym_30}")
print(f"  v3.5: Human=SI but GenAI majority=N/O: {asym_35}")
rev_30 = sum(
    1
    for pid in si_no_pids
    if human_majority.get(pid) == "None/Other" and majority_vote(v30, pid) == "Strategy Integration"
)
rev_35 = sum(
    1
    for pid in si_no_pids
    if human_majority.get(pid) == "None/Other" and majority_vote(v35, pid) == "Strategy Integration"
)
print(f"  v3.0: Human=N/O but GenAI majority=SI: {rev_30}")
print(f"  v3.5: Human=N/O but GenAI majority=SI: {rev_35}")
print()

# ── Section 5: Per-model quality on confusion axes ───────────────────────────
print("=" * 80)
print("5. PER-MODEL ACCURACY ON CONFUSION-AXIS PARAGRAPHS (vs human majority)")
print("=" * 80)
print()
model_results = []
for mname in MODEL_NAMES:
    correct_30 = 0
    correct_35 = 0
    total = 0
    for pid in holdout_pids:
        hm = human_majority.get(pid)
        c30 = v30[mname].get(pid)
        c35 = v35[mname].get(pid)
        if hm and c30 and c35:
            total += 1
            if c30 == hm:
                correct_30 += 1
            if c35 == hm:
                correct_35 += 1
    acc30 = correct_30 / total * 100 if total else 0
    acc35 = correct_35 / total * 100 if total else 0
    model_results.append((mname, total, acc30, acc35, acc35 - acc30))
# Sort by v3.5 accuracy descending (stable: ties keep MODEL_NAMES order).
model_results.sort(key=lambda r: r[3], reverse=True)
header = f"{'Rank':>4} {'Model':<18} {'n':>5} {'v3.0 Acc':>9} {'v3.5 Acc':>9} {'Delta':>8}"
print(header)
print("-" * len(header))
for rank, (mname, total, acc30, acc35, delta) in enumerate(model_results, 1):
    print(f"{rank:>4} {mname:<18} {total:>5} {acc30:>8.1f}% {acc35:>8.1f}% {delta:>+7.1f}pp")
print()

# ── Section 6: Model convergence ─────────────────────────────────────────────
print("=" * 80)
print("6. MODEL CONVERGENCE (pairwise agreement)")
print("=" * 80)
print()
v30_avg = agreement_rate(v30, holdout_pids)
v35_avg = agreement_rate(v35, holdout_pids)
print("Average pairwise agreement among 7 models:")
print(f"  v3.0: {v30_avg:.3f}")
print(f"  v3.5: {v35_avg:.3f}")
print(f"  Delta: {v35_avg - v30_avg:+.3f}")
print()

# Per-model average agreement with others
print("Per-model average agreement with other 6 models:")
header = f"{'Model':<18} {'v3.0':>8} {'v3.5':>8} {'Delta':>8}"
print(header)
print("-" * len(header))
v30_mat = pairwise_agreement_matrix(v30, holdout_pids)
v35_mat = pairwise_agreement_matrix(v35, holdout_pids)
for i, mname in enumerate(MODEL_NAMES):
    # Average agreement with other models (exclude self)
    others_30 = [v30_mat[i, j] for j in range(len(MODEL_NAMES)) if j != i]
    others_35 = [v35_mat[i, j] for j in range(len(MODEL_NAMES)) if j != i]
    avg30 = np.mean(others_30)
    avg35 = np.mean(others_35)
    print(f"{mname:<18} {avg30:>7.3f} {avg35:>7.3f} {avg35 - avg30:>+7.3f}")
print()

# Outlier detection
print("Outlier check (models with lowest v3.5 agreement):")
v35_avgs = []
for i, mname in enumerate(MODEL_NAMES):
    others = [v35_mat[i, j] for j in range(len(MODEL_NAMES)) if j != i]
    v35_avgs.append((mname, np.mean(others)))
v35_avgs.sort(key=lambda x: x[1])
mean_agree = np.mean([x[1] for x in v35_avgs])
std_agree = np.std([x[1] for x in v35_avgs])
for mname, avg in v35_avgs:
    # z-score of each model's agreement; z < -1.5 flags a likely outlier.
    z = (avg - mean_agree) / std_agree if std_agree > 0 else 0
    flag = " *** OUTLIER" if z < -1.5 else ""
    print(f"  {mname:<18} {avg:.3f} (z={z:+.2f}){flag}")
print()
print("=" * 80)
print("DONE")
print("=" * 80)