519 lines
18 KiB
Python
519 lines
18 KiB
Python
"""Compare v3.0 vs v3.5 annotations on 359 confusion-axis holdout paragraphs."""
|
|
|
|
import json
|
|
from collections import Counter, defaultdict
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
|
|
# ── Paths ──────────────────────────────────────────────────────────────────────
# Repo root; assumes this script lives one directory below the root — TODO confirm layout.
ROOT = Path(__file__).resolve().parent.parent

# Golden (Opus) annotation files for each prompt version.
V30_GOLDEN = ROOT / "data/annotations/golden/opus.jsonl"
V35_GOLDEN = ROOT / "data/annotations/golden-v35/opus.jsonl"

# Bench-holdout annotation directories (one JSONL file per non-Opus model).
V30_BENCH = ROOT / "data/annotations/bench-holdout"
V35_BENCH = ROOT / "data/annotations/bench-holdout-v35"

# Raw per-annotator human labels and holdout paragraph metadata (axes, flags).
HUMAN_LABELS = ROOT / "data/gold/human-labels-raw.jsonl"
HOLDOUT_META = ROOT / "data/gold/holdout-rerun-v35.jsonl"

# Annotation filenames, index-aligned with MODEL_NAMES below.
MODEL_FILES = [
    "opus.jsonl",  # golden dirs
    "gpt-5.4.jsonl",
    "gemini-3.1-pro-preview.jsonl",
    "glm-5:exacto.jsonl",
    "kimi-k2.5.jsonl",
    "mimo-v2-pro:exacto.jsonl",
    "minimax-m2.7:exacto.jsonl",
]

# Display names, index-aligned with MODEL_FILES above.
MODEL_NAMES = [
    "Opus",
    "GPT-5.4",
    "Gemini-3.1-Pro",
    "GLM-5",
    "Kimi-K2.5",
    "Mimo-v2-Pro",
    "MiniMax-M2.7",
]

# Category abbreviations used in axes
# Maps abbreviation → full category name; ABBREV_CAT below is the inverse.
CAT_ABBREV = {
    "BG": "Board Governance",
    "MR": "Management Role",
    "RMP": "Risk Management Process",
    "SI": "Strategy Integration",
    "NO": "None/Other",
    "ID": "Incident Disclosure",
    "TPR": "Third-Party Risk",
}

# Inverse mapping: full category name → abbreviation.
ABBREV_CAT = {v: k for k, v in CAT_ABBREV.items()}
|
|
|
|
|
|
def abbrev(cat: str) -> str:
    """Shorten a full category name to its abbreviation; unknown names pass through."""
    try:
        return ABBREV_CAT[cat]
    except KeyError:
        return cat
|
|
|
|
|
|
def full_cat(ab: str) -> str:
    """Expand an abbreviation to the full category name; unknown inputs pass through."""
    return CAT_ABBREV[ab] if ab in CAT_ABBREV else ab
|
|
|
|
|
|
# ── Load data ──────────────────────────────────────────────────────────────────
|
|
|
|
def load_jsonl(path: Path) -> list[dict]:
    """Parse a JSONL file and return its records, skipping blank lines."""
    records: list[dict] = []
    with open(path) as handle:
        for raw in handle:
            if raw.strip():
                records.append(json.loads(raw))
    return records
|
|
|
|
|
|
def load_annotations(base_dir: Path, filename: str) -> dict[str, str]:
    """Load paragraphId → content_category mapping.

    Blank lines are skipped; if a paragraphId occurs more than once, the
    last record wins (same as a dict comprehension over the records).
    """
    mapping: dict[str, str] = {}
    with open(base_dir / filename) as handle:
        for raw in handle:
            if not raw.strip():
                continue
            rec = json.loads(raw)
            mapping[rec["paragraphId"]] = rec["label"]["content_category"]
    return mapping
|
|
|
|
|
|
def load_golden(path: Path) -> dict[str, str]:
    """Load a golden JSONL file as a paragraphId → content_category dict."""
    with open(path) as handle:
        rows = [json.loads(raw) for raw in handle if raw.strip()]
    return {row["paragraphId"]: row["label"]["content_category"] for row in rows}
|
|
|
|
|
|
# Load holdout metadata: one record per holdout paragraph with its id, the
# confusion axes it belongs to, and a materiality-language flag.
# NOTE(review): pid_materiality is not read anywhere below — kept for
# parity with the source data; confirm whether it is still needed.
holdout_records = load_jsonl(HOLDOUT_META)
holdout_pids = {r["paragraphId"] for r in holdout_records}
pid_axes = {r["paragraphId"]: r["axes"] for r in holdout_records}
pid_materiality = {r["paragraphId"]: r.get("hasMaterialityLanguage", False) for r in holdout_records}

# Validate the holdout size explicitly rather than with `assert`, which is
# silently stripped under `python -O`.
if len(holdout_pids) != 359:
    raise AssertionError(f"Expected 359 holdout PIDs, got {len(holdout_pids)}")
|
|
|
|
# Load v3.0 and v3.5 annotations per model. v3.0 runs cover more paragraphs,
# so they are filtered down to the 359 holdout PIDs; v3.5 runs are loaded
# as-is (presumably they cover only the holdout — TODO confirm against the
# bench-holdout-v35 files). Downstream code always uses .get(pid) over
# holdout_pids, so unfiltered extras are harmless either way.
v30: dict[str, dict[str, str]] = {}  # model_name → {pid → category}
v35: dict[str, dict[str, str]] = {}

# (The unused enumerate index from the original loop has been dropped, and
# the identical holdout filter is applied once for both branches.)
for fname, mname in zip(MODEL_FILES, MODEL_NAMES):
    if fname == "opus.jsonl":
        v30_all = load_golden(V30_GOLDEN)
        v35[mname] = load_golden(V35_GOLDEN)
    else:
        v30_all = load_annotations(V30_BENCH, fname)
        v35[mname] = load_annotations(V35_BENCH, fname)
    v30[mname] = {pid: v30_all[pid] for pid in holdout_pids if pid in v30_all}
|
|
|
|
# Load human labels and reduce them to one label per paragraph.
human_raw = load_jsonl(HUMAN_LABELS)

# Group raw annotator labels by paragraphId.
human_labels_by_pid: dict[str, list[str]] = defaultdict(list)
for rec in human_raw:
    human_labels_by_pid[rec["paragraphId"]].append(rec["contentCategory"])

# Modal label per paragraph; Counter.most_common breaks ties by the order
# labels were first seen, same as the original explicit loop.
human_majority: dict[str, str] = {
    pid: Counter(labels).most_common(1)[0][0]
    for pid, labels in human_labels_by_pid.items()
}
|
|
|
|
# Axes grouping: invert the pid → axes mapping into axis → set of pids.
axis_pids: dict[str, set[str]] = defaultdict(set)
for pid, axes_for_pid in pid_axes.items():
    for axis_key in axes_for_pid:
        axis_pids[axis_key].add(pid)

# Display labels for the confusion axes analysed below.
AXIS_LABELS = {
    "SI_NO": "SI↔N/O",
    "MR_RMP": "MR↔RMP",
    "BG_MR": "BG↔MR",
    "BG_RMP": "BG↔RMP",
}
|
|
|
|
|
|
# ── Helpers ────────────────────────────────────────────────────────────────────
|
|
|
|
def majority_vote(model_cats: dict[str, dict[str, str]], pid: str) -> str | None:
    """Get majority category across all models for a PID.

    Returns None when no model labeled the PID. Ties break by the order the
    votes were tallied (MODEL_NAMES order), since Counter.most_common keeps
    insertion order among equal counts — same behavior as the original.
    """
    tally: Counter = Counter()
    for model in MODEL_NAMES:
        cat = model_cats[model].get(pid)
        if cat is not None:
            tally[cat] += 1
    if not tally:
        return None
    return tally.most_common(1)[0][0]
|
|
|
|
|
|
def agreement_rate(model_cats: dict[str, dict[str, str]], pids: set[str]) -> float:
    """Average pairwise agreement among the 7 models on the given PIDs.

    For each PID, every unordered pair of models that both labeled it is one
    pair; a pair agrees when both chose the same category. Pair counts are
    pooled over all PIDs (a micro-average), exactly as in the original
    double loop, but computed from per-category tallies: with k_c votes for
    category c out of n total, agreeing pairs are sum_c C(k_c, 2) and total
    pairs are C(n, 2) — O(n) per PID instead of O(n²).

    Returns 0.0 when there are no comparable pairs at all.
    """
    total_pairs = 0
    agree_pairs = 0
    for pid in pids:
        tally = Counter(
            cat
            for cat in (model_cats[m].get(pid) for m in MODEL_NAMES)
            if cat is not None
        )
        n = sum(tally.values())
        total_pairs += n * (n - 1) // 2
        agree_pairs += sum(k * (k - 1) // 2 for k in tally.values())
    return agree_pairs / total_pairs if total_pairs > 0 else 0.0
|
|
|
|
|
|
def pairwise_agreement_matrix(model_cats: dict[str, dict[str, str]], pids: set[str]) -> np.ndarray:
    """Return the 7x7 pairwise agreement matrix over the given PIDs.

    Entry (i, j) is the fraction of PIDs labeled by both model i and model j
    on which their categories match (0.0 when they share no labeled PIDs);
    the diagonal is 1.0 by definition. Agreement is symmetric, so only the
    upper triangle is computed and mirrored — half the work of the original
    full double loop, with identical results.
    """
    n = len(MODEL_NAMES)
    mat = np.eye(n)  # diagonal pre-set to 1.0
    for i in range(n):
        cats_i = model_cats[MODEL_NAMES[i]]  # hoist dict lookups out of the pid loop
        for j in range(i + 1, n):
            cats_j = model_cats[MODEL_NAMES[j]]
            agree = 0
            total = 0
            for pid in pids:
                ci = cats_i.get(pid)
                cj = cats_j.get(pid)
                if ci is not None and cj is not None:
                    total += 1
                    if ci == cj:
                        agree += 1
            rate = agree / total if total > 0 else 0.0
            mat[i, j] = rate
            mat[j, i] = rate
    return mat
|
|
|
|
|
|
# ── Section 1: Per-model category change rate ─────────────────────────────────

print("=" * 80)
print("1. PER-MODEL CATEGORY CHANGE RATE (v3.0 → v3.5)")
print("=" * 80)
print()

header = f"{'Model':<18} {'Changed':>8} {'Total':>6} {'% Changed':>10}"
print(header)
print("-" * len(header))

# For each model, count holdout paragraphs labeled in both versions and how
# many of those changed category between v3.0 and v3.5.
for mname in MODEL_NAMES:
    changed = 0
    total = 0
    for pid in holdout_pids:
        before = v30[mname].get(pid)
        after = v35[mname].get(pid)
        if before is None or after is None:
            continue
        total += 1
        changed += before != after
    pct = changed / total * 100 if total > 0 else 0
    print(f"{mname:<18} {changed:>8} {total:>6} {pct:>9.1f}%")

print()

# Most frequent v3.0 → v3.5 category transitions per model (abbreviated).
print("Top category transitions per model:")
print()
for mname in MODEL_NAMES:
    transitions: Counter = Counter(
        (abbrev(before), abbrev(after))
        for before, after in (
            (v30[mname].get(pid), v35[mname].get(pid)) for pid in holdout_pids
        )
        if before is not None and after is not None and before != after
    )
    if transitions:
        shown = ", ".join(f"{a}→{b} ({n})" for (a, b), n in transitions.most_common(5))
        print(f" {mname:<18} {shown}")

print()
|
|
|
|
# ── Section 2: Per-axis resolution analysis ───────────────────────────────────
# For each confusion axis: the distribution of GenAI majority labels under
# each prompt version, how many paragraphs flipped majority, and the change
# in 7-model pairwise agreement.
# (Fixes: removed unused `cat_a, cat_b = axis_key.split("_")` locals; guard
# against an empty axis, which would otherwise divide by zero below.)

print("=" * 80)
print("2. PER-AXIS RESOLUTION ANALYSIS")
print("=" * 80)
print()

for axis_key, axis_label in AXIS_LABELS.items():
    pids_on_axis = axis_pids[axis_key]

    print(f"--- {axis_label} ({len(pids_on_axis)} paragraphs) ---")
    print()

    if not pids_on_axis:
        continue  # nothing to analyze; avoids ZeroDivisionError on the flip rate

    # v3.0 and v3.5 majorities
    v30_maj = {pid: majority_vote(v30, pid) for pid in pids_on_axis}
    v35_maj = {pid: majority_vote(v35, pid) for pid in pids_on_axis}

    # Majority distribution (None majorities excluded)
    v30_dist = Counter(v for v in v30_maj.values() if v)
    v35_dist = Counter(v for v in v35_maj.values() if v)

    print(" v3.0 majority distribution: ", end="")
    print(", ".join(f"{abbrev(k)}={v}" for k, v in v30_dist.most_common()))

    print(" v3.5 majority distribution: ", end="")
    print(", ".join(f"{abbrev(k)}={v}" for k, v in v35_dist.most_common()))

    # Paragraphs whose (non-None) majority flipped between versions
    flipped = sum(
        1 for pid in pids_on_axis
        if v30_maj.get(pid) and v35_maj.get(pid) and v30_maj[pid] != v35_maj[pid]
    )
    print(f" Paragraphs with flipped majority: {flipped}/{len(pids_on_axis)} ({flipped / len(pids_on_axis) * 100:.1f}%)")

    # Change in 7-model average pairwise agreement on this axis
    v30_agree = agreement_rate(v30, pids_on_axis)
    v35_agree = agreement_rate(v35, pids_on_axis)
    print(f" 7-model avg pairwise agreement: v3.0={v30_agree:.3f} → v3.5={v35_agree:.3f} (Δ={v35_agree - v30_agree:+.3f})")
    print()
|
|
|
|
# ── Section 3: Human alignment improvement ───────────────────────────────────
# Compare the GenAI majority label under each prompt version against the
# human-majority label, over holdout paragraphs that have human labels.

print("=" * 80)
print("3. HUMAN ALIGNMENT IMPROVEMENT")
print("=" * 80)
print()

# Overall
pids_with_human = holdout_pids & set(human_majority.keys())

v30_agree_human = 0
v35_agree_human = 0
total_human = 0
for pid in pids_with_human:
    gold = human_majority[pid]
    m30 = majority_vote(v30, pid)
    m35 = majority_vote(v35, pid)
    if m30 is None or m35 is None:
        continue
    total_human += 1
    v30_agree_human += m30 == gold
    v35_agree_human += m35 == gold

v30_pct = v30_agree_human / total_human * 100 if total_human else 0
v35_pct = v35_agree_human / total_human * 100 if total_human else 0

print(f"Overall (n={total_human}):")
print(f" v3.0 GenAI majority vs human majority: {v30_agree_human}/{total_human} ({v30_pct:.1f}%)")
print(f" v3.5 GenAI majority vs human majority: {v35_agree_human}/{total_human} ({v35_pct:.1f}%)")
print(f" Delta: {v35_pct - v30_pct:+.1f}pp")
print()
|
|
|
|
# By axis: the same human-alignment comparison restricted to each axis.
print("By axis:")
header = f"{'Axis':<12} {'n':>4} {'v3.0 %':>8} {'v3.5 %':>8} {'Delta':>8}"
print(header)
print("-" * len(header))

for axis_key, axis_label in AXIS_LABELS.items():
    tot = 0
    a30 = 0
    a35 = 0
    for pid in axis_pids[axis_key] & pids_with_human:
        gold = human_majority[pid]
        m30 = majority_vote(v30, pid)
        m35 = majority_vote(v35, pid)
        if m30 is None or m35 is None:
            continue
        tot += 1
        a30 += m30 == gold
        a35 += m35 == gold
    p30 = a30 / tot * 100 if tot else 0
    p35 = a35 / tot * 100 if tot else 0
    print(f"{axis_label:<12} {tot:>4} {p30:>7.1f}% {p35:>7.1f}% {p35 - p30:>+7.1f}pp")

print()
|
|
|
|
# ── Section 4: SI↔N/O specific analysis ──────────────────────────────────────
# Per-model counts of Strategy Integration vs None/Other calls on the
# SI↔N/O axis under each prompt version.
# (Fixes: removed unused tot30/tot35 locals — the printed SI% columns are
# over all SI↔N/O paragraphs, not just SI+NO calls, so SI% + NO% need not
# sum to 100; guarded against an empty axis dividing by zero.)

print("=" * 80)
print("4. SI↔N/O SPECIFIC ANALYSIS")
print("=" * 80)
print()

si_no_pids = axis_pids["SI_NO"]
print(f"Paragraphs on SI↔N/O axis: {len(si_no_pids)}")
print()

# Per-model SI call rate
print("Per-model SI call rate:")
header = f"{'Model':<18} {'v3.0 SI':>8} {'v3.0 NO':>8} {'v3.5 SI':>8} {'v3.5 NO':>8} {'v3.0 SI%':>9} {'v3.5 SI%':>9}"
print(header)
print("-" * len(header))

denom = len(si_no_pids) or 1  # guard: empty axis → 0% rather than ZeroDivisionError
for mname in MODEL_NAMES:
    si30 = sum(1 for pid in si_no_pids if v30[mname].get(pid) == "Strategy Integration")
    no30 = sum(1 for pid in si_no_pids if v30[mname].get(pid) == "None/Other")
    si35 = sum(1 for pid in si_no_pids if v35[mname].get(pid) == "Strategy Integration")
    no35 = sum(1 for pid in si_no_pids if v35[mname].get(pid) == "None/Other")
    pct30 = si30 / denom * 100
    pct35 = si35 / denom * 100
    print(f"{mname:<18} {si30:>8} {no30:>8} {si35:>8} {no35:>8} {pct30:>8.1f}% {pct35:>8.1f}%")

print()
|
|
|
|
# N/O → SI switches per model (and the reverse direction), counted in a
# single pass over the SI↔N/O paragraphs.
print("Models switching N/O → SI on SI↔N/O paragraphs:")
for mname in MODEL_NAMES:
    forward = 0
    backward = 0
    for pid in si_no_pids:
        before = v30[mname].get(pid)
        after = v35[mname].get(pid)
        if before == "None/Other" and after == "Strategy Integration":
            forward += 1
        elif before == "Strategy Integration" and after == "None/Other":
            backward += 1
    print(f" {mname:<18} N/O→SI: {forward:>3}, SI→N/O: {backward:>3}")

print()
|
|
|
|
# Per-paragraph tally shift
print("Per-paragraph SI vs N/O tally (v3.0 → v3.5), showing shifts:")
print()
header = f"{'ParagraphId':<38} {'v3.0 SI':>7} {'v3.0 NO':>7} {'v3.5 SI':>7} {'v3.5 NO':>7} {'Human':>6} {'Resolved?':>10}"
print(header)
print("-" * len(header))


def _tally(cats: dict[str, dict[str, str]], pid: str, category: str) -> int:
    """Count how many models assigned `category` to `pid`."""
    return sum(1 for m in MODEL_NAMES if cats[m].get(pid) == category)


resolved_count = 0
total_si_no_with_human = 0
for pid in sorted(si_no_pids):
    si30 = _tally(v30, pid, "Strategy Integration")
    no30 = _tally(v30, pid, "None/Other")
    si35 = _tally(v35, pid, "Strategy Integration")
    no35 = _tally(v35, pid, "None/Other")
    hm = human_majority.get(pid, "?")
    hm_ab = abbrev(hm) if hm != "?" else "?"

    # "Resolved" = v3.5 majority matches human majority
    if si35 > no35:
        v35_maj = "SI"
    elif no35 > si35:
        v35_maj = "NO"
    else:
        v35_maj = "TIE"
    resolved = "YES" if hm_ab == v35_maj else ("" if hm == "?" else "no")
    if hm != "?":
        total_si_no_with_human += 1
        if hm_ab == v35_maj:
            resolved_count += 1

    print(f"{pid[:36]:<38} {si30:>7} {no30:>7} {si35:>7} {no35:>7} {hm_ab:>6} {resolved:>10}")

print()
# The ternary short-circuits, so the division only evaluates when the
# denominator is nonzero.
print(f"SI↔N/O resolution rate (v3.5 majority matches human): {resolved_count}/{total_si_no_with_human} ({resolved_count / total_si_no_with_human * 100:.1f}%)" if total_si_no_with_human else "No human labels for SI↔N/O paragraphs")
|
|
|
|
# 23:0 asymmetry check
print()
print("23:0 asymmetry check:")


def _count_disagreements(genai: dict[str, dict[str, str]], human_cat: str, genai_cat: str) -> int:
    """Count SI↔N/O paragraphs whose human majority is `human_cat` while the GenAI majority is `genai_cat`."""
    return sum(
        1 for pid in si_no_pids
        if human_majority.get(pid) == human_cat and majority_vote(genai, pid) == genai_cat
    )


# Human says SI, models say N/O — the direction the 23:0 asymmetry ran.
asym_30 = _count_disagreements(v30, "Strategy Integration", "None/Other")
asym_35 = _count_disagreements(v35, "Strategy Integration", "None/Other")
print(f" v3.0: Human=SI but GenAI majority=N/O: {asym_30}")
print(f" v3.5: Human=SI but GenAI majority=N/O: {asym_35}")

# The reverse direction, to check the fix didn't overshoot.
rev_30 = _count_disagreements(v30, "None/Other", "Strategy Integration")
rev_35 = _count_disagreements(v35, "None/Other", "Strategy Integration")
print(f" v3.0: Human=N/O but GenAI majority=SI: {rev_30}")
print(f" v3.5: Human=N/O but GenAI majority=SI: {rev_35}")

print()
|
|
|
|
# ── Section 5: Per-model quality on confusion axes ───────────────────────────
# Each model's accuracy against the human-majority label, per prompt version,
# over holdout paragraphs labeled by the model in both versions.

print("=" * 80)
print("5. PER-MODEL ACCURACY ON CONFUSION-AXIS PARAGRAPHS (vs human majority)")
print("=" * 80)
print()

model_results = []
for mname in MODEL_NAMES:
    total = 0
    correct_30 = 0
    correct_35 = 0
    for pid in holdout_pids:
        gold = human_majority.get(pid)
        before = v30[mname].get(pid)
        after = v35[mname].get(pid)
        if not (gold and before and after):
            continue
        total += 1
        correct_30 += before == gold
        correct_35 += after == gold
    acc30 = correct_30 / total * 100 if total else 0
    acc35 = correct_35 / total * 100 if total else 0
    model_results.append((mname, total, acc30, acc35, acc35 - acc30))

# Sort by v3.5 accuracy descending (stable: ties keep original model order,
# just as the negated-key sort did).
model_results.sort(key=lambda row: row[3], reverse=True)

header = f"{'Rank':>4} {'Model':<18} {'n':>5} {'v3.0 Acc':>9} {'v3.5 Acc':>9} {'Delta':>8}"
print(header)
print("-" * len(header))
for rank, (mname, total, acc30, acc35, delta) in enumerate(model_results, 1):
    print(f"{rank:>4} {mname:<18} {total:>5} {acc30:>8.1f}% {acc35:>8.1f}% {delta:>+7.1f}pp")

print()
|
|
|
|
# ── Section 6: Model convergence ─────────────────────────────────────────────
# Overall and per-model pairwise agreement, plus an outlier check.
# (Fix: the per-model v3.5 averages were computed twice — once for the
# agreement table and again for the outlier check; now computed once and
# reused. Also dropped an `f` prefix on a placeholder-free string.)

print("=" * 80)
print("6. MODEL CONVERGENCE (pairwise agreement)")
print("=" * 80)
print()

v30_avg = agreement_rate(v30, holdout_pids)
v35_avg = agreement_rate(v35, holdout_pids)

print("Average pairwise agreement among 7 models:")
print(f" v3.0: {v30_avg:.3f}")
print(f" v3.5: {v35_avg:.3f}")
print(f" Delta: {v35_avg - v30_avg:+.3f}")
print()

# Per-model average agreement with others
print("Per-model average agreement with other 6 models:")
header = f"{'Model':<18} {'v3.0':>8} {'v3.5':>8} {'Delta':>8}"
print(header)
print("-" * len(header))

v30_mat = pairwise_agreement_matrix(v30, holdout_pids)
v35_mat = pairwise_agreement_matrix(v35, holdout_pids)

n_models = len(MODEL_NAMES)
v35_avgs: list[tuple[str, float]] = []  # (model, mean v3.5 agreement with others)
for i, mname in enumerate(MODEL_NAMES):
    # Average agreement with other models (exclude self)
    others_30 = [v30_mat[i, j] for j in range(n_models) if j != i]
    others_35 = [v35_mat[i, j] for j in range(n_models) if j != i]
    avg30 = np.mean(others_30)
    avg35 = np.mean(others_35)
    v35_avgs.append((mname, avg35))
    print(f"{mname:<18} {avg30:>7.3f} {avg35:>7.3f} {avg35 - avg30:>+7.3f}")

print()

# Outlier detection: flag models whose v3.5 agreement sits > 1.5 SD below
# the across-model mean.
print("Outlier check (models with lowest v3.5 agreement):")
v35_avgs.sort(key=lambda x: x[1])
mean_agree = np.mean([x[1] for x in v35_avgs])
std_agree = np.std([x[1] for x in v35_avgs])

for mname, avg in v35_avgs:
    z = (avg - mean_agree) / std_agree if std_agree > 0 else 0
    flag = " *** OUTLIER" if z < -1.5 else ""
    print(f" {mname:<18} {avg:.3f} (z={z:+.2f}){flag}")

print()
print("=" * 80)
print("DONE")
print("=" * 80)
|