519 lines
18 KiB
Python
519 lines
18 KiB
Python
"""Compare v3.0 vs v3.5 annotations on 359 confusion-axis holdout paragraphs."""
|
|
|
|
import json
|
|
from collections import Counter, defaultdict
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
|
|
# ── Paths ──────────────────────────────────────────────────────────────────────
# Repo root; assumes this script lives one directory below the root — TODO confirm layout.
ROOT = Path(__file__).resolve().parent.parent

# Golden (Opus) annotation files for each prompt version.
V30_GOLDEN = ROOT / "data/annotations/golden/opus.jsonl"
V35_GOLDEN = ROOT / "data/annotations/golden-v35/opus.jsonl"

# Bench-holdout annotation directories (one JSONL file per non-Opus model).
V30_BENCH = ROOT / "data/annotations/bench-holdout"
V35_BENCH = ROOT / "data/annotations/bench-holdout-v35"

# Raw per-annotator human labels and holdout paragraph metadata (axes, flags).
HUMAN_LABELS = ROOT / "data/gold/human-labels-raw.jsonl"
HOLDOUT_META = ROOT / "data/gold/holdout-rerun-v35.jsonl"

# Annotation filenames, index-aligned with MODEL_NAMES below.
MODEL_FILES = [
    "opus.jsonl",  # golden dirs
    "gpt-5.4.jsonl",
    "gemini-3.1-pro-preview.jsonl",
    "glm-5:exacto.jsonl",
    "kimi-k2.5.jsonl",
    "mimo-v2-pro:exacto.jsonl",
    "minimax-m2.7:exacto.jsonl",
]

# Display names, index-aligned with MODEL_FILES above.
MODEL_NAMES = [
    "Opus",
    "GPT-5.4",
    "Gemini-3.1-Pro",
    "GLM-5",
    "Kimi-K2.5",
    "Mimo-v2-Pro",
    "MiniMax-M2.7",
]

# Category abbreviations used in axes
# Maps abbreviation → full category name; ABBREV_CAT below is the inverse.
CAT_ABBREV = {
    "BG": "Board Governance",
    "MR": "Management Role",
    "RMP": "Risk Management Process",
    "SI": "Strategy Integration",
    "NO": "None/Other",
    "ID": "Incident Disclosure",
    "TPR": "Third-Party Risk",
}

# Inverse mapping: full category name → abbreviation.
ABBREV_CAT = {v: k for k, v in CAT_ABBREV.items()}
|
|
|
|
|
|
def abbrev(cat: str) -> str:
    """Shorten a full category name to its abbreviation; unknown names pass through."""
    try:
        return ABBREV_CAT[cat]
    except KeyError:
        return cat
|
|
|
|
|
|
def full_cat(ab: str) -> str:
    """Expand an abbreviation to the full category name; unknown inputs pass through."""
    return CAT_ABBREV[ab] if ab in CAT_ABBREV else ab
|
|
|
|
|
|
# ── Load data ──────────────────────────────────────────────────────────────────
|
|
|
|
def load_jsonl(path: Path) -> list[dict]:
    """Parse a JSONL file and return its records, skipping blank lines."""
    records: list[dict] = []
    with open(path) as handle:
        for raw in handle:
            if raw.strip():
                records.append(json.loads(raw))
    return records
|
|
|
|
|
|
def load_annotations(base_dir: Path, filename: str) -> dict[str, str]:
    """Load paragraphId → content_category mapping.

    Blank lines are skipped; if a paragraphId occurs more than once, the
    last record wins (same as a dict comprehension over the records).
    """
    mapping: dict[str, str] = {}
    with open(base_dir / filename) as handle:
        for raw in handle:
            if not raw.strip():
                continue
            rec = json.loads(raw)
            mapping[rec["paragraphId"]] = rec["label"]["content_category"]
    return mapping
|
|
|
|
|
|
def load_golden(path: Path) -> dict[str, str]:
    """Load a golden JSONL file as a paragraphId → content_category dict."""
    with open(path) as handle:
        rows = [json.loads(raw) for raw in handle if raw.strip()]
    return {row["paragraphId"]: row["label"]["content_category"] for row in rows}
|
|
|
|
|
|
# Load holdout metadata: one record per holdout paragraph with its id, the
# confusion axes it belongs to, and a materiality-language flag.
# NOTE(review): pid_materiality is not read anywhere below — kept for
# parity with the source data; confirm whether it is still needed.
holdout_records = load_jsonl(HOLDOUT_META)
holdout_pids = {r["paragraphId"] for r in holdout_records}
pid_axes = {r["paragraphId"]: r["axes"] for r in holdout_records}
pid_materiality = {r["paragraphId"]: r.get("hasMaterialityLanguage", False) for r in holdout_records}

# Validate the holdout size explicitly rather than with `assert`, which is
# silently stripped under `python -O`.
if len(holdout_pids) != 359:
    raise AssertionError(f"Expected 359 holdout PIDs, got {len(holdout_pids)}")
|
|
|
|
# Load v3.0 and v3.5 annotations per model. v3.0 runs cover more paragraphs,
# so they are filtered down to the 359 holdout PIDs; v3.5 runs are loaded
# as-is (presumably they cover only the holdout — TODO confirm against the
# bench-holdout-v35 files). Downstream code always uses .get(pid) over
# holdout_pids, so unfiltered extras are harmless either way.
v30: dict[str, dict[str, str]] = {}  # model_name → {pid → category}
v35: dict[str, dict[str, str]] = {}

# (The unused enumerate index from the original loop has been dropped, and
# the identical holdout filter is applied once for both branches.)
for fname, mname in zip(MODEL_FILES, MODEL_NAMES):
    if fname == "opus.jsonl":
        v30_all = load_golden(V30_GOLDEN)
        v35[mname] = load_golden(V35_GOLDEN)
    else:
        v30_all = load_annotations(V30_BENCH, fname)
        v35[mname] = load_annotations(V35_BENCH, fname)
    v30[mname] = {pid: v30_all[pid] for pid in holdout_pids if pid in v30_all}
|
|
|
|
# Load human labels and reduce them to one label per paragraph.
human_raw = load_jsonl(HUMAN_LABELS)

# Group raw annotator labels by paragraphId.
human_labels_by_pid: dict[str, list[str]] = defaultdict(list)
for rec in human_raw:
    human_labels_by_pid[rec["paragraphId"]].append(rec["contentCategory"])

# Modal label per paragraph; Counter.most_common breaks ties by the order
# labels were first seen, same as the original explicit loop.
human_majority: dict[str, str] = {
    pid: Counter(labels).most_common(1)[0][0]
    for pid, labels in human_labels_by_pid.items()
}
|
|
|
|
# Axes grouping: invert the pid → axes mapping into axis → set of pids.
axis_pids: dict[str, set[str]] = defaultdict(set)
for pid, axes_for_pid in pid_axes.items():
    for axis_key in axes_for_pid:
        axis_pids[axis_key].add(pid)

# Display labels for the confusion axes analysed below.
AXIS_LABELS = {
    "SI_NO": "SI↔N/O",
    "MR_RMP": "MR↔RMP",
    "BG_MR": "BG↔MR",
    "BG_RMP": "BG↔RMP",
}
|
|
|
|
|
|
# ── Helpers ────────────────────────────────────────────────────────────────────
|
|
|
|
def majority_vote(model_cats: dict[str, dict[str, str]], pid: str) -> str | None:
    """Get majority category across all models for a PID.

    Returns None when no model labeled the PID. Ties break by the order the
    votes were tallied (MODEL_NAMES order), since Counter.most_common keeps
    insertion order among equal counts — same behavior as the original.
    """
    tally: Counter = Counter()
    for model in MODEL_NAMES:
        cat = model_cats[model].get(pid)
        if cat is not None:
            tally[cat] += 1
    if not tally:
        return None
    return tally.most_common(1)[0][0]
|
|
|
|
|
|
def agreement_rate(model_cats: dict[str, dict[str, str]], pids: set[str]) -> float:
    """Average pairwise agreement among the 7 models on the given PIDs.

    For each PID, every unordered pair of models that both labeled it is one
    pair; a pair agrees when both chose the same category. Pair counts are
    pooled over all PIDs (a micro-average), exactly as in the original
    double loop, but computed from per-category tallies: with k_c votes for
    category c out of n total, agreeing pairs are sum_c C(k_c, 2) and total
    pairs are C(n, 2) — O(n) per PID instead of O(n²).

    Returns 0.0 when there are no comparable pairs at all.
    """
    total_pairs = 0
    agree_pairs = 0
    for pid in pids:
        tally = Counter(
            cat
            for cat in (model_cats[m].get(pid) for m in MODEL_NAMES)
            if cat is not None
        )
        n = sum(tally.values())
        total_pairs += n * (n - 1) // 2
        agree_pairs += sum(k * (k - 1) // 2 for k in tally.values())
    return agree_pairs / total_pairs if total_pairs > 0 else 0.0
|
|
|
|
|
|
def pairwise_agreement_matrix(model_cats: dict[str, dict[str, str]], pids: set[str]) -> np.ndarray:
    """Return the 7x7 pairwise agreement matrix over the given PIDs.

    Entry (i, j) is the fraction of PIDs labeled by both model i and model j
    on which their categories match (0.0 when they share no labeled PIDs);
    the diagonal is 1.0 by definition. Agreement is symmetric, so only the
    upper triangle is computed and mirrored — half the work of the original
    full double loop, with identical results.
    """
    n = len(MODEL_NAMES)
    mat = np.eye(n)  # diagonal pre-set to 1.0
    for i in range(n):
        cats_i = model_cats[MODEL_NAMES[i]]  # hoist dict lookups out of the pid loop
        for j in range(i + 1, n):
            cats_j = model_cats[MODEL_NAMES[j]]
            agree = 0
            total = 0
            for pid in pids:
                ci = cats_i.get(pid)
                cj = cats_j.get(pid)
                if ci is not None and cj is not None:
                    total += 1
                    if ci == cj:
                        agree += 1
            rate = agree / total if total > 0 else 0.0
            mat[i, j] = rate
            mat[j, i] = rate
    return mat
|
|
|
|
|
|
# ── Section 1: Per-model category change rate ─────────────────────────────────

print("=" * 80)
print("1. PER-MODEL CATEGORY CHANGE RATE (v3.0 → v3.5)")
print("=" * 80)
print()

header = f"{'Model':<18} {'Changed':>8} {'Total':>6} {'% Changed':>10}"
print(header)
print("-" * len(header))

# For each model, count holdout paragraphs labeled in both versions and how
# many of those changed category between v3.0 and v3.5.
for mname in MODEL_NAMES:
    changed = 0
    total = 0
    for pid in holdout_pids:
        before = v30[mname].get(pid)
        after = v35[mname].get(pid)
        if before is None or after is None:
            continue
        total += 1
        changed += before != after
    pct = changed / total * 100 if total > 0 else 0
    print(f"{mname:<18} {changed:>8} {total:>6} {pct:>9.1f}%")

print()

# Most frequent v3.0 → v3.5 category transitions per model (abbreviated).
print("Top category transitions per model:")
print()
for mname in MODEL_NAMES:
    transitions: Counter = Counter(
        (abbrev(before), abbrev(after))
        for before, after in (
            (v30[mname].get(pid), v35[mname].get(pid)) for pid in holdout_pids
        )
        if before is not None and after is not None and before != after
    )
    if transitions:
        shown = ", ".join(f"{a}→{b} ({n})" for (a, b), n in transitions.most_common(5))
        print(f" {mname:<18} {shown}")

print()
|
|
|
|
# ── Section 2: Per-axis resolution analysis ───────────────────────────────────
# For each confusion axis: the distribution of GenAI majority labels under
# each prompt version, how many paragraphs flipped majority, and the change
# in 7-model pairwise agreement.
# (Fixes: removed unused `cat_a, cat_b = axis_key.split("_")` locals; guard
# against an empty axis, which would otherwise divide by zero below.)

print("=" * 80)
print("2. PER-AXIS RESOLUTION ANALYSIS")
print("=" * 80)
print()

for axis_key, axis_label in AXIS_LABELS.items():
    pids_on_axis = axis_pids[axis_key]

    print(f"--- {axis_label} ({len(pids_on_axis)} paragraphs) ---")
    print()

    if not pids_on_axis:
        continue  # nothing to analyze; avoids ZeroDivisionError on the flip rate

    # v3.0 and v3.5 majorities
    v30_maj = {pid: majority_vote(v30, pid) for pid in pids_on_axis}
    v35_maj = {pid: majority_vote(v35, pid) for pid in pids_on_axis}

    # Majority distribution (None majorities excluded)
    v30_dist = Counter(v for v in v30_maj.values() if v)
    v35_dist = Counter(v for v in v35_maj.values() if v)

    print(" v3.0 majority distribution: ", end="")
    print(", ".join(f"{abbrev(k)}={v}" for k, v in v30_dist.most_common()))

    print(" v3.5 majority distribution: ", end="")
    print(", ".join(f"{abbrev(k)}={v}" for k, v in v35_dist.most_common()))

    # Paragraphs whose (non-None) majority flipped between versions
    flipped = sum(
        1 for pid in pids_on_axis
        if v30_maj.get(pid) and v35_maj.get(pid) and v30_maj[pid] != v35_maj[pid]
    )
    print(f" Paragraphs with flipped majority: {flipped}/{len(pids_on_axis)} ({flipped / len(pids_on_axis) * 100:.1f}%)")

    # Change in 7-model average pairwise agreement on this axis
    v30_agree = agreement_rate(v30, pids_on_axis)
    v35_agree = agreement_rate(v35, pids_on_axis)
    print(f" 7-model avg pairwise agreement: v3.0={v30_agree:.3f} → v3.5={v35_agree:.3f} (Δ={v35_agree - v30_agree:+.3f})")
    print()
|
|
|
|
# ── Section 3: Human alignment improvement ───────────────────────────────────
# Compare the GenAI majority label under each prompt version against the
# human-majority label, over holdout paragraphs that have human labels.

print("=" * 80)
print("3. HUMAN ALIGNMENT IMPROVEMENT")
print("=" * 80)
print()

# Overall
pids_with_human = holdout_pids & set(human_majority.keys())

v30_agree_human = 0
v35_agree_human = 0
total_human = 0
for pid in pids_with_human:
    gold = human_majority[pid]
    m30 = majority_vote(v30, pid)
    m35 = majority_vote(v35, pid)
    if m30 is None or m35 is None:
        continue
    total_human += 1
    v30_agree_human += m30 == gold
    v35_agree_human += m35 == gold

v30_pct = v30_agree_human / total_human * 100 if total_human else 0
v35_pct = v35_agree_human / total_human * 100 if total_human else 0

print(f"Overall (n={total_human}):")
print(f" v3.0 GenAI majority vs human majority: {v30_agree_human}/{total_human} ({v30_pct:.1f}%)")
print(f" v3.5 GenAI majority vs human majority: {v35_agree_human}/{total_human} ({v35_pct:.1f}%)")
print(f" Delta: {v35_pct - v30_pct:+.1f}pp")
print()
|
|
|
|
# By axis: the same human-alignment comparison restricted to each axis.
print("By axis:")
header = f"{'Axis':<12} {'n':>4} {'v3.0 %':>8} {'v3.5 %':>8} {'Delta':>8}"
print(header)
print("-" * len(header))

for axis_key, axis_label in AXIS_LABELS.items():
    tot = 0
    a30 = 0
    a35 = 0
    for pid in axis_pids[axis_key] & pids_with_human:
        gold = human_majority[pid]
        m30 = majority_vote(v30, pid)
        m35 = majority_vote(v35, pid)
        if m30 is None or m35 is None:
            continue
        tot += 1
        a30 += m30 == gold
        a35 += m35 == gold
    p30 = a30 / tot * 100 if tot else 0
    p35 = a35 / tot * 100 if tot else 0
    print(f"{axis_label:<12} {tot:>4} {p30:>7.1f}% {p35:>7.1f}% {p35 - p30:>+7.1f}pp")

print()
|
|
|
|
# ── Section 4: SI↔N/O specific analysis ──────────────────────────────────────
# Per-model counts of Strategy Integration vs None/Other calls on the
# SI↔N/O axis under each prompt version.
# (Fixes: removed unused tot30/tot35 locals — the printed SI% columns are
# over all SI↔N/O paragraphs, not just SI+NO calls, so SI% + NO% need not
# sum to 100; guarded against an empty axis dividing by zero.)

print("=" * 80)
print("4. SI↔N/O SPECIFIC ANALYSIS")
print("=" * 80)
print()

si_no_pids = axis_pids["SI_NO"]
print(f"Paragraphs on SI↔N/O axis: {len(si_no_pids)}")
print()

# Per-model SI call rate
print("Per-model SI call rate:")
header = f"{'Model':<18} {'v3.0 SI':>8} {'v3.0 NO':>8} {'v3.5 SI':>8} {'v3.5 NO':>8} {'v3.0 SI%':>9} {'v3.5 SI%':>9}"
print(header)
print("-" * len(header))

denom = len(si_no_pids) or 1  # guard: empty axis → 0% rather than ZeroDivisionError
for mname in MODEL_NAMES:
    si30 = sum(1 for pid in si_no_pids if v30[mname].get(pid) == "Strategy Integration")
    no30 = sum(1 for pid in si_no_pids if v30[mname].get(pid) == "None/Other")
    si35 = sum(1 for pid in si_no_pids if v35[mname].get(pid) == "Strategy Integration")
    no35 = sum(1 for pid in si_no_pids if v35[mname].get(pid) == "None/Other")
    pct30 = si30 / denom * 100
    pct35 = si35 / denom * 100
    print(f"{mname:<18} {si30:>8} {no30:>8} {si35:>8} {no35:>8} {pct30:>8.1f}% {pct35:>8.1f}%")

print()
|
|
|
|
# N/O → SI switches per model (and the reverse direction), counted in a
# single pass over the SI↔N/O paragraphs.
print("Models switching N/O → SI on SI↔N/O paragraphs:")
for mname in MODEL_NAMES:
    forward = 0
    backward = 0
    for pid in si_no_pids:
        before = v30[mname].get(pid)
        after = v35[mname].get(pid)
        if before == "None/Other" and after == "Strategy Integration":
            forward += 1
        elif before == "Strategy Integration" and after == "None/Other":
            backward += 1
    print(f" {mname:<18} N/O→SI: {forward:>3}, SI→N/O: {backward:>3}")

print()
|
|
|
|
# Per-paragraph tally shift
print("Per-paragraph SI vs N/O tally (v3.0 → v3.5), showing shifts:")
print()
header = f"{'ParagraphId':<38} {'v3.0 SI':>7} {'v3.0 NO':>7} {'v3.5 SI':>7} {'v3.5 NO':>7} {'Human':>6} {'Resolved?':>10}"
print(header)
print("-" * len(header))


def _tally(cats: dict[str, dict[str, str]], pid: str, category: str) -> int:
    """Count how many models assigned `category` to `pid`."""
    return sum(1 for m in MODEL_NAMES if cats[m].get(pid) == category)


resolved_count = 0
total_si_no_with_human = 0
for pid in sorted(si_no_pids):
    si30 = _tally(v30, pid, "Strategy Integration")
    no30 = _tally(v30, pid, "None/Other")
    si35 = _tally(v35, pid, "Strategy Integration")
    no35 = _tally(v35, pid, "None/Other")
    hm = human_majority.get(pid, "?")
    hm_ab = abbrev(hm) if hm != "?" else "?"

    # "Resolved" = v3.5 majority matches human majority
    if si35 > no35:
        v35_maj = "SI"
    elif no35 > si35:
        v35_maj = "NO"
    else:
        v35_maj = "TIE"
    resolved = "YES" if hm_ab == v35_maj else ("" if hm == "?" else "no")
    if hm != "?":
        total_si_no_with_human += 1
        if hm_ab == v35_maj:
            resolved_count += 1

    print(f"{pid[:36]:<38} {si30:>7} {no30:>7} {si35:>7} {no35:>7} {hm_ab:>6} {resolved:>10}")

print()
# The ternary short-circuits, so the division only evaluates when the
# denominator is nonzero.
print(f"SI↔N/O resolution rate (v3.5 majority matches human): {resolved_count}/{total_si_no_with_human} ({resolved_count / total_si_no_with_human * 100:.1f}%)" if total_si_no_with_human else "No human labels for SI↔N/O paragraphs")
|
|
|
|
# 23:0 asymmetry check
print()
print("23:0 asymmetry check:")


def _count_disagreements(genai: dict[str, dict[str, str]], human_cat: str, genai_cat: str) -> int:
    """Count SI↔N/O paragraphs whose human majority is `human_cat` while the GenAI majority is `genai_cat`."""
    return sum(
        1 for pid in si_no_pids
        if human_majority.get(pid) == human_cat and majority_vote(genai, pid) == genai_cat
    )


# Human says SI, models say N/O — the direction the 23:0 asymmetry ran.
asym_30 = _count_disagreements(v30, "Strategy Integration", "None/Other")
asym_35 = _count_disagreements(v35, "Strategy Integration", "None/Other")
print(f" v3.0: Human=SI but GenAI majority=N/O: {asym_30}")
print(f" v3.5: Human=SI but GenAI majority=N/O: {asym_35}")

# The reverse direction, to check the fix didn't overshoot.
rev_30 = _count_disagreements(v30, "None/Other", "Strategy Integration")
rev_35 = _count_disagreements(v35, "None/Other", "Strategy Integration")
print(f" v3.0: Human=N/O but GenAI majority=SI: {rev_30}")
print(f" v3.5: Human=N/O but GenAI majority=SI: {rev_35}")

print()
|
|
|
|
# ── Section 5: Per-model quality on confusion axes ───────────────────────────
# Each model's accuracy against the human-majority label, per prompt version,
# over holdout paragraphs labeled by the model in both versions.

print("=" * 80)
print("5. PER-MODEL ACCURACY ON CONFUSION-AXIS PARAGRAPHS (vs human majority)")
print("=" * 80)
print()

model_results = []
for mname in MODEL_NAMES:
    total = 0
    correct_30 = 0
    correct_35 = 0
    for pid in holdout_pids:
        gold = human_majority.get(pid)
        before = v30[mname].get(pid)
        after = v35[mname].get(pid)
        if not (gold and before and after):
            continue
        total += 1
        correct_30 += before == gold
        correct_35 += after == gold
    acc30 = correct_30 / total * 100 if total else 0
    acc35 = correct_35 / total * 100 if total else 0
    model_results.append((mname, total, acc30, acc35, acc35 - acc30))

# Sort by v3.5 accuracy descending (stable: ties keep original model order,
# just as the negated-key sort did).
model_results.sort(key=lambda row: row[3], reverse=True)

header = f"{'Rank':>4} {'Model':<18} {'n':>5} {'v3.0 Acc':>9} {'v3.5 Acc':>9} {'Delta':>8}"
print(header)
print("-" * len(header))
for rank, (mname, total, acc30, acc35, delta) in enumerate(model_results, 1):
    print(f"{rank:>4} {mname:<18} {total:>5} {acc30:>8.1f}% {acc35:>8.1f}% {delta:>+7.1f}pp")

print()
|
|
|
|
# ── Section 6: Model convergence ─────────────────────────────────────────────
# Overall and per-model pairwise agreement, plus an outlier check.
# (Fix: the per-model v3.5 averages were computed twice — once for the
# agreement table and again for the outlier check; now computed once and
# reused. Also dropped an `f` prefix on a placeholder-free string.)

print("=" * 80)
print("6. MODEL CONVERGENCE (pairwise agreement)")
print("=" * 80)
print()

v30_avg = agreement_rate(v30, holdout_pids)
v35_avg = agreement_rate(v35, holdout_pids)

print("Average pairwise agreement among 7 models:")
print(f" v3.0: {v30_avg:.3f}")
print(f" v3.5: {v35_avg:.3f}")
print(f" Delta: {v35_avg - v30_avg:+.3f}")
print()

# Per-model average agreement with others
print("Per-model average agreement with other 6 models:")
header = f"{'Model':<18} {'v3.0':>8} {'v3.5':>8} {'Delta':>8}"
print(header)
print("-" * len(header))

v30_mat = pairwise_agreement_matrix(v30, holdout_pids)
v35_mat = pairwise_agreement_matrix(v35, holdout_pids)

n_models = len(MODEL_NAMES)
v35_avgs: list[tuple[str, float]] = []  # (model, mean v3.5 agreement with others)
for i, mname in enumerate(MODEL_NAMES):
    # Average agreement with other models (exclude self)
    others_30 = [v30_mat[i, j] for j in range(n_models) if j != i]
    others_35 = [v35_mat[i, j] for j in range(n_models) if j != i]
    avg30 = np.mean(others_30)
    avg35 = np.mean(others_35)
    v35_avgs.append((mname, avg35))
    print(f"{mname:<18} {avg30:>7.3f} {avg35:>7.3f} {avg35 - avg30:>+7.3f}")

print()

# Outlier detection: flag models whose v3.5 agreement sits > 1.5 SD below
# the across-model mean.
print("Outlier check (models with lowest v3.5 agreement):")
v35_avgs.sort(key=lambda x: x[1])
mean_agree = np.mean([x[1] for x in v35_avgs])
std_agree = np.std([x[1] for x in v35_avgs])

for mname, avg in v35_avgs:
    z = (avg - mean_agree) / std_agree if std_agree > 0 else 0
    flag = " *** OUTLIER" if z < -1.5 else ""
    print(f" {mname:<18} {avg:.3f} (z={z:+.2f}){flag}")

print()
print("=" * 80)
print("DONE")
print("=" * 80)
|