# SEC-cyBERT/scripts/compare-v30-v35.py
# Exported 2026-04-03 14:43:53 -04:00 · 519 lines · 18 KiB · Python
"""Compare v3.0 vs v3.5 annotations on 359 confusion-axis holdout paragraphs."""
import json
from collections import Counter, defaultdict
from pathlib import Path
import numpy as np
# ── Paths ──────────────────────────────────────────────────────────────────────
# Repository root: scripts/ sits one level below the project root.
ROOT = Path(__file__).resolve().parent.parent
# Golden (Opus) annotation files for prompt versions v3.0 and v3.5.
V30_GOLDEN = ROOT / "data/annotations/golden/opus.jsonl"
V35_GOLDEN = ROOT / "data/annotations/golden-v35/opus.jsonl"
# Bench-holdout annotation directories (one JSONL per non-Opus model).
V30_BENCH = ROOT / "data/annotations/bench-holdout"
V35_BENCH = ROOT / "data/annotations/bench-holdout-v35"
# Raw per-annotator human labels, and metadata for the holdout paragraphs.
HUMAN_LABELS = ROOT / "data/gold/human-labels-raw.jsonl"
HOLDOUT_META = ROOT / "data/gold/holdout-rerun-v35.jsonl"
# Annotation filenames; index-aligned with MODEL_NAMES (zipped together below).
MODEL_FILES = [
    "opus.jsonl",  # lives in the golden dirs, not the bench-holdout dirs
    "gpt-5.4.jsonl",
    "gemini-3.1-pro-preview.jsonl",
    "glm-5:exacto.jsonl",
    "kimi-k2.5.jsonl",
    "mimo-v2-pro:exacto.jsonl",
    "minimax-m2.7:exacto.jsonl",
]
# Display names, in the same order as MODEL_FILES.
MODEL_NAMES = [
    "Opus",
    "GPT-5.4",
    "Gemini-3.1-Pro",
    "GLM-5",
    "Kimi-K2.5",
    "Mimo-v2-Pro",
    "MiniMax-M2.7",
]
# Category abbreviations used in axes
CAT_ABBREV = {
    "BG": "Board Governance",
    "MR": "Management Role",
    "RMP": "Risk Management Process",
    "SI": "Strategy Integration",
    "NO": "None/Other",
    "ID": "Incident Disclosure",
    "TPR": "Third-Party Risk",
}
# Reverse lookup: full category name -> short code.
ABBREV_CAT = {full: short for short, full in CAT_ABBREV.items()}


def abbrev(cat: str) -> str:
    """Return the short code for a full category name (pass-through if unknown)."""
    return ABBREV_CAT.get(cat, cat)


def full_cat(ab: str) -> str:
    """Return the full category name for a short code (pass-through if unknown)."""
    return CAT_ABBREV.get(ab, ab)
# ── Load data ──────────────────────────────────────────────────────────────────
def load_jsonl(path: Path) -> list[dict]:
    """Parse a JSONL file into a list of dicts, skipping blank lines.

    BUG FIX: open the file as UTF-8 explicitly; the original relied on the
    platform's default locale encoding, which breaks on non-ASCII annotation
    text under e.g. Windows cp1252.
    """
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
def load_annotations(base_dir: Path, filename: str) -> dict[str, str]:
    """Load paragraphId → content_category mapping from one model's file."""
    records = load_jsonl(base_dir / filename)
    return {rec["paragraphId"]: rec["label"]["content_category"] for rec in records}
def load_golden(path: Path) -> dict[str, str]:
    """Load a golden annotation file: paragraphId → content_category."""
    return {
        rec["paragraphId"]: rec["label"]["content_category"]
        for rec in load_jsonl(path)
    }
# Load holdout metadata: paragraph IDs, confusion axes, materiality flags.
holdout_records = load_jsonl(HOLDOUT_META)
holdout_pids = {r["paragraphId"] for r in holdout_records}
pid_axes = {r["paragraphId"]: r["axes"] for r in holdout_records}
# NOTE(review): pid_materiality is not consumed anywhere in this script's
# visible sections — kept for parity with the metadata file; confirm usage.
pid_materiality = {r["paragraphId"]: r.get("hasMaterialityLanguage", False) for r in holdout_records}
# BUG FIX: `assert` is stripped under `python -O`; validate with an explicit
# raise so the sanity check always runs.
if len(holdout_pids) != 359:
    raise ValueError(f"Expected 359 holdout PIDs, got {len(holdout_pids)}")
# Load per-model annotations for both prompt versions.
# v3.0 maps are filtered to the 359 holdout PIDs; v3.5 files are loaded as-is
# (assumed to contain only holdout paragraphs — downstream access is via .get,
# so extra keys would be harmless anyway).
v30: dict[str, dict[str, str]] = {}  # model_name → {pid → category}
v35: dict[str, dict[str, str]] = {}
# BUG FIX (idiom): dropped the unused `enumerate` index and deduplicated the
# holdout-filtering expression that was repeated in both branches.
for fname, mname in zip(MODEL_FILES, MODEL_NAMES):
    if fname == "opus.jsonl":
        # Opus annotations live in the golden dirs, not the bench-holdout dirs.
        v30_all = load_golden(V30_GOLDEN)
        v35_all = load_golden(V35_GOLDEN)
    else:
        v30_all = load_annotations(V30_BENCH, fname)
        v35_all = load_annotations(V35_BENCH, fname)
    v30[mname] = {pid: v30_all[pid] for pid in holdout_pids if pid in v30_all}
    v35[mname] = v35_all
# Load human labels and reduce each paragraph's annotator votes to the mode.
human_raw = load_jsonl(HUMAN_LABELS)
# Group by paragraphId, compute majority (Counter ties break by first-seen label).
human_labels_by_pid: dict[str, list[str]] = defaultdict(list)
for rec in human_raw:
    human_labels_by_pid[rec["paragraphId"]].append(rec["contentCategory"])
human_majority: dict[str, str] = {
    pid: Counter(labels).most_common(1)[0][0]
    for pid, labels in human_labels_by_pid.items()
}
# Invert pid → axes into axis → set of pids.
axis_pids: dict[str, set[str]] = defaultdict(set)
for pid, axis_list in pid_axes.items():
    for axis in axis_list:
        axis_pids[axis].add(pid)

# Display labels for the four confusion axes.
AXIS_LABELS = {
    "SI_NO": "SI↔N/O",
    "MR_RMP": "MR↔RMP",
    "BG_MR": "BG↔MR",
    "BG_RMP": "BG↔RMP",
}
# ── Helpers ────────────────────────────────────────────────────────────────────
def majority_vote(model_cats: dict[str, dict[str, str]], pid: str) -> str | None:
    """Return the modal category across all models for *pid*.

    Returns None when no model has a label for the paragraph. Ties break in
    MODEL_NAMES order (Counter preserves first-seen insertion order).
    """
    ballots = [
        cat
        for name in MODEL_NAMES
        if (cat := model_cats[name].get(pid)) is not None
    ]
    if not ballots:
        return None
    return Counter(ballots).most_common(1)[0][0]
def agreement_rate(model_cats: dict[str, dict[str, str]], pids: set[str]) -> float:
    """Average pairwise agreement among the models on the given PIDs.

    For each paragraph, every unordered pair of model votes counts once; a
    pair agrees when both models picked the same category. Computed from the
    per-category vote tally: C(k, 2) agreeing pairs per category with k votes.
    """
    total_pairs = 0
    agree_pairs = 0
    for pid in pids:
        tally = Counter(
            cat
            for name in MODEL_NAMES
            if (cat := model_cats[name].get(pid)) is not None
        )
        n_votes = sum(tally.values())
        total_pairs += n_votes * (n_votes - 1) // 2
        agree_pairs += sum(k * (k - 1) // 2 for k in tally.values())
    return agree_pairs / total_pairs if total_pairs > 0 else 0.0
def pairwise_agreement_matrix(model_cats: dict[str, dict[str, str]], pids: set[str]) -> np.ndarray:
    """Return the NxN (N = len(MODEL_NAMES)) pairwise agreement matrix.

    Entry (i, j) is the fraction of PIDs labeled by both models i and j on
    which their categories match; the diagonal is 1.0 and the matrix is
    symmetric, so each pair is computed once and mirrored.
    """
    n = len(MODEL_NAMES)
    mat = np.eye(n)
    for i in range(n):
        for j in range(i + 1, n):
            agree = 0
            total = 0
            for pid in pids:
                ci = model_cats[MODEL_NAMES[i]].get(pid)
                cj = model_cats[MODEL_NAMES[j]].get(pid)
                if ci is None or cj is None:
                    continue
                total += 1
                if ci == cj:
                    agree += 1
            rate = agree / total if total > 0 else 0.0
            mat[i, j] = rate
            mat[j, i] = rate
    return mat
# ── Section 1: Per-model category change rate ─────────────────────────────────
print("=" * 80)
print("1. PER-MODEL CATEGORY CHANGE RATE (v3.0 → v3.5)")
print("=" * 80)
print()
header = f"{'Model':<18} {'Changed':>8} {'Total':>6} {'% Changed':>10}"
print(header)
print("-" * len(header))
for mname in MODEL_NAMES:
    changed = 0
    total = 0
    for pid in holdout_pids:
        c30 = v30[mname].get(pid)
        c35 = v35[mname].get(pid)
        # Only paragraphs labeled under both prompt versions are comparable.
        if c30 is not None and c35 is not None:
            total += 1
            if c30 != c35:
                changed += 1
    pct = (changed / total * 100) if total > 0 else 0
    print(f"{mname:<18} {changed:>8} {total:>6} {pct:>9.1f}%")
print()
# Top 5 most frequent v3.0→v3.5 category transitions for each model.
print("Top category transitions per model:")
print()
for mname in MODEL_NAMES:
    transitions: Counter = Counter()
    for pid in holdout_pids:
        c30 = v30[mname].get(pid)
        c35 = v35[mname].get(pid)
        if c30 is not None and c35 is not None and c30 != c35:
            transitions[(abbrev(c30), abbrev(c35))] += 1
    if transitions:
        top = transitions.most_common(5)
        # BUG FIX: the two abbreviations were concatenated with no separator
        # ("SINO", "RMPTPR"), which is ambiguous for multi-letter codes; join
        # with an arrow, matching the transition notation used elsewhere.
        parts = [f"{a}→{b} ({n})" for (a, b), n in top]
        print(f" {mname:<18} {', '.join(parts)}")
print()
# ── Section 2: Per-axis resolution analysis ───────────────────────────────────
# Axis keys are of the form "CATA_CATB" (e.g. "SI_NO"); see AXIS_LABELS.
print("=" * 80)
print("2. PER-AXIS RESOLUTION ANALYSIS")
print("=" * 80)
print()
for axis_key, axis_label in AXIS_LABELS.items():
    pids_on_axis = axis_pids[axis_key]
    print(f"--- {axis_label} ({len(pids_on_axis)} paragraphs) ---")
    print()
    # Per-paragraph 7-model GenAI majorities under each prompt version.
    v30_maj = {pid: majority_vote(v30, pid) for pid in pids_on_axis}
    v35_maj = {pid: majority_vote(v35, pid) for pid in pids_on_axis}
    # Distribution of majority categories (None majorities excluded).
    v30_dist = Counter(v for v in v30_maj.values() if v)
    v35_dist = Counter(v for v in v35_maj.values() if v)
    print(" v3.0 majority distribution: ", end="")
    print(", ".join(f"{abbrev(k)}={v}" for k, v in v30_dist.most_common()))
    print(" v3.5 majority distribution: ", end="")
    print(", ".join(f"{abbrev(k)}={v}" for k, v in v35_dist.most_common()))
    # Flipped = both versions produced a majority and they differ.
    flipped = sum(
        1 for pid in pids_on_axis
        if v30_maj.get(pid) and v35_maj.get(pid) and v30_maj[pid] != v35_maj[pid]
    )
    # BUG FIX: guard against ZeroDivisionError when an axis has no paragraphs
    # (axis_pids is a defaultdict, so an unknown key yields an empty set).
    flip_pct = (flipped / len(pids_on_axis) * 100) if pids_on_axis else 0.0
    print(f" Paragraphs with flipped majority: {flipped}/{len(pids_on_axis)} ({flip_pct:.1f}%)")
    # Agreement shift among the 7 models on this axis.
    v30_agree = agreement_rate(v30, pids_on_axis)
    v35_agree = agreement_rate(v35, pids_on_axis)
    print(f" 7-model avg pairwise agreement: v3.0={v30_agree:.3f} → v3.5={v35_agree:.3f} (Δ={v35_agree - v30_agree:+.3f})")
    print()
# ── Section 3: Human alignment improvement ───────────────────────────────────
print("=" * 80)
print("3. HUMAN ALIGNMENT IMPROVEMENT")
print("=" * 80)
print()
# Overall: GenAI 7-model majority vs. human majority on shared paragraphs.
pids_with_human = holdout_pids & human_majority.keys()
v30_agree_human = 0
v35_agree_human = 0
total_human = 0
for pid in pids_with_human:
    hm = human_majority[pid]
    m30 = majority_vote(v30, pid)
    m35 = majority_vote(v35, pid)
    if m30 is None or m35 is None:
        continue
    total_human += 1
    v30_agree_human += m30 == hm
    v35_agree_human += m35 == hm
v30_pct = v30_agree_human / total_human * 100 if total_human else 0
v35_pct = v35_agree_human / total_human * 100 if total_human else 0
print(f"Overall (n={total_human}):")
print(f" v3.0 GenAI majority vs human majority: {v30_agree_human}/{total_human} ({v30_pct:.1f}%)")
print(f" v3.5 GenAI majority vs human majority: {v35_agree_human}/{total_human} ({v35_pct:.1f}%)")
print(f" Delta: {v35_pct - v30_pct:+.1f}pp")
print()
# Same human-alignment comparison, broken down per confusion axis.
print("By axis:")
header = f"{'Axis':<12} {'n':>4} {'v3.0 %':>8} {'v3.5 %':>8} {'Delta':>8}"
print(header)
print("-" * len(header))
for axis_key, axis_label in AXIS_LABELS.items():
    pids_ax = axis_pids[axis_key] & pids_with_human
    a30 = 0
    a35 = 0
    tot = 0
    for pid in pids_ax:
        hm = human_majority[pid]
        m30 = majority_vote(v30, pid)
        m35 = majority_vote(v35, pid)
        if m30 is None or m35 is None:
            continue
        tot += 1
        a30 += m30 == hm
        a35 += m35 == hm
    p30 = a30 / tot * 100 if tot else 0
    p35 = a35 / tot * 100 if tot else 0
    print(f"{axis_label:<12} {tot:>4} {p30:>7.1f}% {p35:>7.1f}% {p35 - p30:>+7.1f}pp")
print()
# ── Section 4: SI↔N/O specific analysis ──────────────────────────────────────
print("=" * 80)
print("4. SI↔N/O SPECIFIC ANALYSIS")
print("=" * 80)
print()
si_no_pids = axis_pids["SI_NO"]
print(f"Paragraphs on SI↔N/O axis: {len(si_no_pids)}")
print()
# Per-model SI call rate
print("Per-model SI call rate:")
header = f"{'Model':<18} {'v3.0 SI':>8} {'v3.0 NO':>8} {'v3.5 SI':>8} {'v3.5 NO':>8} {'v3.0 SI%':>9} {'v3.5 SI%':>9}"
print(header)
print("-" * len(header))
# BUG FIX: guard the denominator against an empty axis set (defaultdict would
# return an empty set for a missing key and the divisions below would raise).
denom = len(si_no_pids) or 1
for mname in MODEL_NAMES:
    si30 = sum(1 for pid in si_no_pids if v30[mname].get(pid) == "Strategy Integration")
    no30 = sum(1 for pid in si_no_pids if v30[mname].get(pid) == "None/Other")
    si35 = sum(1 for pid in si_no_pids if v35[mname].get(pid) == "Strategy Integration")
    no35 = sum(1 for pid in si_no_pids if v35[mname].get(pid) == "None/Other")
    # NOTE(review): SI% is computed over ALL axis paragraphs, including those
    # the model labeled neither SI nor N/O. The original also computed si+no
    # totals that were never used (dead code, removed here); confirm whether
    # SI% was intended to be si / (si + no) instead.
    pct30 = si30 / denom * 100
    pct35 = si35 / denom * 100
    print(f"{mname:<18} {si30:>8} {no30:>8} {si35:>8} {no35:>8} {pct30:>8.1f}% {pct35:>8.1f}%")
print()
# Direction of label switches on the SI↔N/O axis, per model.
print("Models switching N/O → SI on SI↔N/O paragraphs:")
for mname in MODEL_NAMES:
    switches = 0
    reverse = 0
    for pid in si_no_pids:
        before = v30[mname].get(pid)
        after = v35[mname].get(pid)
        if before == "None/Other" and after == "Strategy Integration":
            switches += 1
        elif before == "Strategy Integration" and after == "None/Other":
            reverse += 1
    print(f" {mname:<18} N/O→SI: {switches:>3}, SI→N/O: {reverse:>3}")
print()
# Per-paragraph tally shift: SI vs N/O vote counts per prompt version, with
# the human majority label and whether v3.5 "resolved" the paragraph.
print("Per-paragraph SI vs N/O tally (v3.0 → v3.5), showing shifts:")
print()
header = f"{'ParagraphId':<38} {'v3.0 SI':>7} {'v3.0 NO':>7} {'v3.5 SI':>7} {'v3.5 NO':>7} {'Human':>6} {'Resolved?':>10}"
print(header)
print("-" * len(header))
resolved_count = 0
total_si_no_with_human = 0
for pid in sorted(si_no_pids):
    # Number of the 7 models voting each side of the axis, per version.
    si30 = sum(1 for m in MODEL_NAMES if v30[m].get(pid) == "Strategy Integration")
    no30 = sum(1 for m in MODEL_NAMES if v30[m].get(pid) == "None/Other")
    si35 = sum(1 for m in MODEL_NAMES if v35[m].get(pid) == "Strategy Integration")
    no35 = sum(1 for m in MODEL_NAMES if v35[m].get(pid) == "None/Other")
    hm = human_majority.get(pid, "?")  # "?" = no human label for this paragraph
    hm_ab = abbrev(hm) if hm != "?" else "?"
    # "Resolved" = v3.5 majority matches human majority
    # NOTE(review): this rebinds v35_maj (a dict inside the Section 2 loop) to
    # a plain string; harmless sequentially, but a rename would be safer.
    v35_maj = "SI" if si35 > no35 else ("NO" if no35 > si35 else "TIE")
    resolved = "YES" if hm_ab == v35_maj else ("" if hm == "?" else "no")
    if hm != "?":
        total_si_no_with_human += 1
        if hm_ab == v35_maj:
            resolved_count += 1
    # ParagraphId is truncated to 36 characters to keep the table aligned.
    print(f"{pid[:36]:<38} {si30:>7} {no30:>7} {si35:>7} {no35:>7} {hm_ab:>6} {resolved:>10}")
print()
# The conditional expression selects the whole message; printed exactly once.
print(f"SI↔N/O resolution rate (v3.5 majority matches human): {resolved_count}/{total_si_no_with_human} ({resolved_count / total_si_no_with_human * 100:.1f}%)" if total_si_no_with_human else "No human labels for SI↔N/O paragraphs")
# 23:0 asymmetry check
print()
print("23:0 asymmetry check:")


def _count_disagreement(model_cats: dict[str, dict[str, str]], human_cat: str, genai_cat: str) -> int:
    """Count SI↔N/O paragraphs where the human majority is *human_cat* but the GenAI majority is *genai_cat*."""
    return sum(
        1
        for pid in si_no_pids
        if human_majority.get(pid) == human_cat
        and majority_vote(model_cats, pid) == genai_cat
    )


# Human says SI while the model pool says N/O, under each prompt version.
asym_30 = _count_disagreement(v30, "Strategy Integration", "None/Other")
asym_35 = _count_disagreement(v35, "Strategy Integration", "None/Other")
print(f" v3.0: Human=SI but GenAI majority=N/O: {asym_30}")
print(f" v3.5: Human=SI but GenAI majority=N/O: {asym_35}")
# The opposite direction, for symmetry.
rev_30 = _count_disagreement(v30, "None/Other", "Strategy Integration")
rev_35 = _count_disagreement(v35, "None/Other", "Strategy Integration")
print(f" v3.0: Human=N/O but GenAI majority=SI: {rev_30}")
print(f" v3.5: Human=N/O but GenAI majority=SI: {rev_35}")
print()
# ── Section 5: Per-model quality on confusion axes ───────────────────────────
print("=" * 80)
print("5. PER-MODEL ACCURACY ON CONFUSION-AXIS PARAGRAPHS (vs human majority)")
print("=" * 80)
print()
model_results = []
for mname in MODEL_NAMES:
    correct_30 = 0
    correct_35 = 0
    total = 0
    for pid in holdout_pids:
        hm = human_majority.get(pid)
        c30 = v30[mname].get(pid)
        c35 = v35[mname].get(pid)
        # Require a human label plus both model labels to score the paragraph.
        if not (hm and c30 and c35):
            continue
        total += 1
        correct_30 += c30 == hm
        correct_35 += c35 == hm
    acc30 = correct_30 / total * 100 if total else 0
    acc35 = correct_35 / total * 100 if total else 0
    model_results.append((mname, total, acc30, acc35, acc35 - acc30))
# Rank by v3.5 accuracy, best first (stable sort keeps input order on ties).
model_results.sort(key=lambda row: row[3], reverse=True)
header = f"{'Rank':>4} {'Model':<18} {'n':>5} {'v3.0 Acc':>9} {'v3.5 Acc':>9} {'Delta':>8}"
print(header)
print("-" * len(header))
for rank, (mname, total, acc30, acc35, delta) in enumerate(model_results, 1):
    print(f"{rank:>4} {mname:<18} {total:>5} {acc30:>8.1f}% {acc35:>8.1f}% {delta:>+7.1f}pp")
print()
# ── Section 6: Model convergence ─────────────────────────────────────────────
print("=" * 80)
print("6. MODEL CONVERGENCE (pairwise agreement)")
print("=" * 80)
print()
v30_avg = agreement_rate(v30, holdout_pids)
v35_avg = agreement_rate(v35, holdout_pids)
print("Average pairwise agreement among 7 models:")
print(f" v3.0: {v30_avg:.3f}")
print(f" v3.5: {v35_avg:.3f}")
print(f" Delta: {v35_avg - v30_avg:+.3f}")
print()
# Each model's mean agreement with the other six, per version.
print("Per-model average agreement with other 6 models:")
header = f"{'Model':<18} {'v3.0':>8} {'v3.5':>8} {'Delta':>8}"
print(header)
print("-" * len(header))
v30_mat = pairwise_agreement_matrix(v30, holdout_pids)
v35_mat = pairwise_agreement_matrix(v35, holdout_pids)
n_models = len(MODEL_NAMES)
for i, mname in enumerate(MODEL_NAMES):
    # Off-diagonal entries in row i = agreement with each other model.
    others = [j for j in range(n_models) if j != i]
    avg30 = np.mean(v30_mat[i, others])
    avg35 = np.mean(v35_mat[i, others])
    print(f"{mname:<18} {avg30:>7.3f} {avg35:>7.3f} {avg35 - avg30:>+7.3f}")
print()
# Outlier detection: flag models whose v3.5 cross-model agreement sits well
# below the pack (z-score under -1.5).
print("Outlier check (models with lowest v3.5 agreement):")
v35_avgs = [
    (mname, np.mean([v35_mat[i, j] for j in range(len(MODEL_NAMES)) if j != i]))
    for i, mname in enumerate(MODEL_NAMES)
]
v35_avgs.sort(key=lambda pair: pair[1])
mean_agree = np.mean([avg for _, avg in v35_avgs])
std_agree = np.std([avg for _, avg in v35_avgs])
for mname, avg in v35_avgs:
    z = (avg - mean_agree) / std_agree if std_agree > 0 else 0
    flag = " *** OUTLIER" if z < -1.5 else ""
    print(f" {mname:<18} {avg:.3f} (z={z:+.2f}){flag}")
print()
print("=" * 80)
print("DONE")
print("=" * 80)