"""V2 holdout benchmark analysis.

Compares all models in data/annotations/v2-bench/ on the 1,200 v2 holdout.
Uses GPT-5.4 (v4.5) as reference since it's our best-validated model.

Outputs:
- Per-model distribution tables (category + specificity)
- Pairwise agreement matrix (category, specificity, both)
- Per-model agreement with GPT-5.4 reference
- Confusion patterns: where models disagree and why
- Confidence distribution per model
- Specific facts coverage analysis
"""
|
|
|
|
import json
|
|
import sys
|
|
from collections import Counter, defaultdict
|
|
from itertools import combinations
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
|
|
# Project-relative data locations (script lives one level below the repo root).
ROOT = Path(__file__).resolve().parent.parent
V2_BENCH = ROOT / "data/annotations/v2-bench"
GOLDEN_DIR = ROOT / "data/annotations/golden"

# Annotation taxonomy: content categories and their table abbreviations.
# NOTE: CAT_SHORT's insertion order drives the column order of report tables,
# so it must stay aligned with CATEGORIES.
CATEGORIES = [
    "Board Governance",
    "Management Role",
    "Risk Management Process",
    "Third-Party Risk",
    "Incident Disclosure",
    "Strategy Integration",
    "None/Other",
]
CAT_SHORT = {
    "Board Governance": "BG",
    "Management Role": "MR",
    "Risk Management Process": "RMP",
    "Third-Party Risk": "TP",
    "Incident Disclosure": "ID",
    "Strategy Integration": "SI",
    "None/Other": "N/O",
}

# Ordinal specificity levels 1-4 and their display labels.
SPEC_LABELS = {1: "L1", 2: "L2", 3: "L3", 4: "L4"}

# File-stem -> human-readable name. Insertion order also fixes the row
# ordering of every per-model table (see the model_names sort below).
MODEL_DISPLAY = {
    "gemini-3.1-flash-lite-preview": "Gemini Lite",
    "mimo-v2-flash": "MIMO Flash",
    "grok-4.1-fast": "Grok Fast",
    "gpt-5.4": "GPT-5.4",
    "kimi-k2.5": "Kimi K2.5",
    "gemini-3.1-pro-preview": "Gemini Pro",
    "glm-5": "GLM-5",
    "minimax-m2.7": "MiniMax M2.7",
    "mimo-v2-pro": "MIMO Pro",
}

# Best-validated model; all "agreement with reference" numbers use it.
REFERENCE_MODEL = "gpt-5.4"
|
|
|
|
|
|
def load_jsonl(path: Path) -> list[dict]:
    """Load a JSON-Lines file, skipping blank lines.

    Args:
        path: Path to a ``.jsonl`` file (one JSON object per line).

    Returns:
        Parsed records in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records: list[dict] = []
    # Explicit UTF-8 so parsing doesn't depend on the platform's locale
    # encoding (annotation files may contain non-ASCII text).
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:  # tolerate blank/padding lines between records
                records.append(json.loads(line))
    return records
|
|
|
|
|
|
def cohens_kappa(a: list, b: list) -> float:
    """Cohen's kappa: chance-corrected agreement between two label sequences."""
    assert len(a) == len(b)
    total = len(a)
    if total == 0:
        return 0.0
    # Map each observed label (from either sequence) to a matrix index.
    label_index = {lab: pos for pos, lab in enumerate(sorted(set(a) | set(b)))}
    size = len(label_index)
    matrix = np.zeros((size, size))
    for left, right in zip(a, b):
        matrix[label_index[left], label_index[right]] += 1
    # Observed agreement: fraction of pairs on the diagonal.
    observed = np.trace(matrix) / total
    # Chance agreement: inner product of the two raters' marginal distributions.
    row_marginals = matrix.sum(axis=1) / total
    col_marginals = matrix.sum(axis=0) / total
    chance = float(np.dot(row_marginals, col_marginals))
    if chance >= 1.0:
        # Degenerate case (e.g. a single label everywhere): define as perfect.
        return 1.0
    return (observed - chance) / (1 - chance)
|
|
|
|
|
|
def weighted_kappa(a: list[int], b: list[int]) -> float:
    """Quadratic-weighted kappa for ordinal specificity.

    Disagreements are penalized by the squared distance between the two
    labels' ranks, so L1-vs-L2 costs less than L1-vs-L4.

    Args:
        a: First rater's ordinal labels.
        b: Second rater's ordinal labels (same length as ``a``).

    Returns:
        Kappa in [-1, 1]. 0.0 for empty input; 1.0 when only one label
        value is observed (both raters trivially agree everywhere).
    """
    assert len(a) == len(b)
    n = len(a)
    if n == 0:
        return 0.0
    labels = sorted(set(a) | set(b))
    k = len(labels)
    # With a single observed label the (k-1)^2 weight denominator is zero
    # (previously a ZeroDivisionError) and agreement is trivially perfect.
    if k < 2:
        return 1.0
    idx = {l: i for i, l in enumerate(labels)}
    conf = np.zeros((k, k))
    for x, y in zip(a, b):
        conf[idx[x]][idx[y]] += 1
    # Quadratic penalty matrix, normalized so the largest disagreement = 1.
    weights = np.zeros((k, k))
    for i in range(k):
        for j in range(k):
            weights[i][j] = (i - j) ** 2 / (k - 1) ** 2
    po = 1 - np.sum(weights * conf) / n  # observed (weighted) agreement
    expected = np.outer(conf.sum(axis=1), conf.sum(axis=0)) / n
    pe = 1 - np.sum(weights * expected) / n  # chance (weighted) agreement
    # Guard the pe == 1 degenerate case (division by zero), mirroring
    # cohens_kappa. The old code tested pe == 0, which instead returned a
    # spurious 1.0 for chance-level total disagreement.
    if pe >= 1.0:
        return 1.0
    return (po - pe) / (1 - pe)
|
|
|
|
|
|
# ── Load all models ──
print("Loading v2-bench annotations...")

# model file-stem -> {paragraphId -> full annotation record}
models: dict[str, dict[str, dict]] = {}  # model_short -> {pid -> annotation}
for f in sorted(V2_BENCH.glob("*.jsonl")):
    # Skip error dumps and "gpt-5.4.v4*" stems — presumably older
    # prompt-version runs of the reference model; TODO confirm the current
    # run uses the plain "gpt-5.4" stem.
    if "errors" in f.name or f.stem.startswith("gpt-5.4.v4"):
        continue
    records = load_jsonl(f)
    # Fewer than 100 records is treated as an incomplete annotation run.
    if len(records) < 100:
        print(f" SKIP {f.name}: only {len(records)} records")
        continue
    model_short = f.stem
    # Index by paragraphId; if a pid repeats in one file, the last record wins.
    by_pid = {r["paragraphId"]: r for r in records}
    models[model_short] = by_pid
    display = MODEL_DISPLAY.get(model_short, model_short)
    print(f" {display}: {len(by_pid)} annotations")

# Load Opus golden if available
opus_path = GOLDEN_DIR / "opus.jsonl"
if opus_path.exists():
    records = load_jsonl(opus_path)
    if len(records) >= 100:
        by_pid = {r["paragraphId"]: r for r in records}
        models["opus-4.6"] = by_pid
        # Register a display name so the report tables label it nicely.
        MODEL_DISPLAY["opus-4.6"] = "Opus 4.6"
        print(f" Opus 4.6: {len(by_pid)} annotations")

# Common paragraph IDs across all models
all_pids = set.intersection(*(set(m.keys()) for m in models.values())) if models else set()
print(f"\n {len(all_pids)} paragraphs common to all {len(models)} models")

if not all_pids:
    # Fall back to pairwise with reference
    ref = models.get(REFERENCE_MODEL)
    if ref:
        all_pids = set(ref.keys())
        print(f" Using {len(all_pids)} reference model paragraphs for pairwise analysis")

# Stable row ordering for every table: MODEL_DISPLAY declaration order,
# with any unknown stems pushed to the end (index 999).
model_names = sorted(models.keys(), key=lambda m: list(MODEL_DISPLAY.keys()).index(m) if m in MODEL_DISPLAY else 999)
|
|
|
|
|
|
def get_label(model: str, pid: str) -> dict | None:
    """Return the label payload for (model, paragraph), or None if absent.

    Annotation records may nest their fields under a "label" key or carry
    them at the top level; whichever dict holds the label fields is returned.
    """
    annotation = models.get(model, {}).get(pid)
    if annotation:
        return annotation.get("label", annotation)
    return None
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
# 1. DISTRIBUTION TABLES
# ═══════════════════════════════════════════════════════════
print("\n" + "═" * 70)
print("CATEGORY DISTRIBUTION")
print("═" * 70)

# One column per category, in CAT_SHORT declaration order.
header = f"{'Model':<16}" + "".join(f"{s:>8}" for s in CAT_SHORT.values())
print(header)
print("─" * len(header))

for m in model_names:
    display = MODEL_DISPLAY.get(m, m)[:15]
    # Fetch every label for this model, then drop paragraphs with no payload.
    cats = [get_label(m, pid) for pid in models[m]]
    cats = [l["content_category"] for l in cats if l]
    counts = Counter(cats)
    total = len(cats)
    row = f"{display:<16}"
    for full_name in CATEGORIES:
        # Percentage of this model's labels in each category (0 if none labeled).
        pct = counts.get(full_name, 0) / total * 100 if total else 0
        row += f"{pct:>7.1f}%"
    print(row)

print("\n" + "═" * 70)
print("SPECIFICITY DISTRIBUTION")
print("═" * 70)

# L1-L4 columns plus the share of "medium"-confidence specificity calls.
header = f"{'Model':<16}" + "".join(f"{s:>8}" for s in SPEC_LABELS.values()) + f"{'Med%':>8}"
print(header)
print("─" * len(header))

for m in model_names:
    display = MODEL_DISPLAY.get(m, m)[:15]
    labels = [get_label(m, pid) for pid in models[m]]
    specs = [l["specificity_level"] for l in labels if l]
    # Missing confidence defaults to "high", so Med% only counts explicit "medium".
    confs = [l.get("specificity_confidence", "high") for l in labels if l]
    counts = Counter(specs)
    total = len(specs)
    med_count = sum(1 for c in confs if c == "medium")
    row = f"{display:<16}"
    for level in SPEC_LABELS:
        pct = counts.get(level, 0) / total * 100 if total else 0
        row += f"{pct:>7.1f}%"
    med_pct = med_count / total * 100 if total else 0
    row += f"{med_pct:>7.1f}%"
    print(row)
|
|
|
|
# ═══════════════════════════════════════════════════════════
# 2. AGREEMENT WITH REFERENCE
# ═══════════════════════════════════════════════════════════
ref_data = models.get(REFERENCE_MODEL)
if ref_data:
    print("\n" + "═" * 70)
    print(f"AGREEMENT WITH {MODEL_DISPLAY.get(REFERENCE_MODEL, REFERENCE_MODEL).upper()}")
    print("═" * 70)

    # Cat%/Spec%: raw agreement. Cat κ: Cohen's kappa. Spec κw: quadratic-
    # weighted kappa. Both%: exact match on category AND specificity.
    header = f"{'Model':<16}{'Cat%':>8}{'Cat κ':>8}{'Spec%':>8}{'Spec κw':>8}{'Both%':>8}{'N':>6}"
    print(header)
    print("─" * len(header))

    for m in model_names:
        if m == REFERENCE_MODEL:
            continue
        display = MODEL_DISPLAY.get(m, m)[:15]
        common = set(models[m].keys()) & set(ref_data.keys())
        # Too little overlap to report meaningful agreement statistics.
        if len(common) < 100:
            print(f"{display:<16} (only {len(common)} common paragraphs)")
            continue

        # Parallel label lists; indices stay aligned because both are
        # appended in the same loop iteration.
        ref_cats, m_cats = [], []
        ref_specs, m_specs = [], []
        both_match = 0

        for pid in common:
            rl = get_label(REFERENCE_MODEL, pid)
            ml = get_label(m, pid)
            if not rl or not ml:
                continue
            ref_cats.append(rl["content_category"])
            m_cats.append(ml["content_category"])
            ref_specs.append(rl["specificity_level"])
            m_specs.append(ml["specificity_level"])
            if rl["content_category"] == ml["content_category"] and rl["specificity_level"] == ml["specificity_level"]:
                both_match += 1

        # NOTE(review): if every common paragraph lacked a label payload,
        # n would be 0 and the divisions below would raise — confirm label
        # payloads are always present in practice.
        n = len(ref_cats)
        cat_agree = sum(1 for a, b in zip(ref_cats, m_cats) if a == b) / n * 100
        spec_agree = sum(1 for a, b in zip(ref_specs, m_specs) if a == b) / n * 100
        both_pct = both_match / n * 100
        cat_k = cohens_kappa(ref_cats, m_cats)
        spec_kw = weighted_kappa(ref_specs, m_specs)

        print(f"{display:<16}{cat_agree:>7.1f}%{cat_k:>8.3f}{spec_agree:>7.1f}%{spec_kw:>8.3f}{both_pct:>7.1f}%{n:>6}")
|
|
|
|
# ═══════════════════════════════════════════════════════════
# 3. PAIRWISE AGREEMENT MATRIX (category kappa)
# ═══════════════════════════════════════════════════════════
# Lower-triangular matrix of Cohen's kappa on content_category for every
# model pair; the diagonal and upper triangle are printed as "—".
print("\n" + "═" * 70)
print("PAIRWISE CATEGORY κ (lower triangle)")
print("═" * 70)

short_names = [MODEL_DISPLAY.get(m, m)[:10] for m in model_names]
header = f"{'':>12}" + "".join(f"{s:>12}" for s in short_names)
print(header)

for i, m1 in enumerate(model_names):
    row = f"{short_names[i]:>12}"
    for j, m2 in enumerate(model_names):
        if j >= i:
            row += f"{'—':>12}"
            continue
        common = set(models[m1].keys()) & set(models[m2].keys())
        if len(common) < 100:
            # Not enough shared paragraphs for a meaningful kappa.
            row += f"{'n/a':>12}"
            continue
        # Fetch each label once per paragraph (the previous version called
        # get_label four times per pid across two comprehensions); the two
        # filtered lists stay index-aligned because they share one filter.
        pairs = [(get_label(m1, pid), get_label(m2, pid)) for pid in common]
        cats1 = [l1["content_category"] for l1, l2 in pairs if l1 and l2]
        cats2 = [l2["content_category"] for l1, l2 in pairs if l1 and l2]
        k = cohens_kappa(cats1, cats2)
        row += f"{k:>12.3f}"
    print(row)
|
|
|
|
# ═══════════════════════════════════════════════════════════
# 4. SPECIFICITY CONFUSION WITH REFERENCE
# ═══════════════════════════════════════════════════════════
if ref_data:
    print("\n" + "═" * 70)
    print("SPECIFICITY CONFUSION vs REFERENCE (rows=model, cols=reference)")
    print("═" * 70)

    for m in model_names:
        if m == REFERENCE_MODEL:
            continue
        display = MODEL_DISPLAY.get(m, m)
        common = set(models[m].keys()) & set(ref_data.keys())
        if len(common) < 100:
            continue

        # 4x4 confusion matrix: row = this model's level, col = reference's.
        # Assumes specificity_level is always in 1..4 (per SPEC_LABELS) —
        # an out-of-range value would raise IndexError here; TODO confirm.
        conf = np.zeros((4, 4), dtype=int)
        for pid in common:
            rl = get_label(REFERENCE_MODEL, pid)
            ml = get_label(m, pid)
            if not rl or not ml:
                continue
            ref_s = rl["specificity_level"] - 1
            mod_s = ml["specificity_level"] - 1
            conf[mod_s][ref_s] += 1

        print(f"\n {display} (N={int(conf.sum())})")
        print(f" {'':>8}" + "".join(f"{'ref ' + SPEC_LABELS[l]:>8}" for l in range(1, 5)))
        for i in range(4):
            row_total = conf[i].sum()
            row = f" {SPEC_LABELS[i+1]:>8}"
            for j in range(4):
                row += f"{conf[i][j]:>8}"
            # Trailing "| total" column shows how often the model chose level i+1.
            print(row + f" | {row_total}")
|
|
|
|
# ═══════════════════════════════════════════════════════════
# 5. CATEGORY DISAGREEMENT PATTERNS
# ═══════════════════════════════════════════════════════════
if ref_data:
    print("\n" + "═" * 70)
    print("TOP CATEGORY DISAGREEMENT PATTERNS vs REFERENCE")
    print("═" * 70)

    for m in model_names:
        if m == REFERENCE_MODEL:
            continue
        display = MODEL_DISPLAY.get(m, m)
        common = set(models[m].keys()) & set(ref_data.keys())
        if len(common) < 100:
            continue

        # (reference_category, model_category) -> count, using short codes.
        disagreements: Counter = Counter()
        for pid in common:
            rl = get_label(REFERENCE_MODEL, pid)
            ml = get_label(m, pid)
            if not rl or not ml:
                continue
            rc = CAT_SHORT[rl["content_category"]]
            mc = CAT_SHORT[ml["content_category"]]
            if rc != mc:
                disagreements[(rc, mc)] += 1

        total_disagree = sum(disagreements.values())
        if total_disagree == 0:
            continue

        print(f"\n {display}: {total_disagree} disagreements ({total_disagree/len(common)*100:.1f}%)")
        # Top 5 directed confusion pairs (reference label → model label).
        for (rc, mc), count in disagreements.most_common(5):
            print(f" {rc} → {mc}: {count}")
|
|
|
|
# ═══════════════════════════════════════════════════════════
# 6. SPECIFIC_FACTS COVERAGE
# ═══════════════════════════════════════════════════════════
print("\n" + "═" * 70)
print("SPECIFIC_FACTS COVERAGE")
print("═" * 70)

# Has facts: % of paragraphs with >=1 extracted fact. Avg #: facts per
# fact-bearing paragraph. L1 empty: % of level-1 labels with no facts
# (expected). L2+ has: % of level-2+ labels that do carry facts (expected).
header = f"{'Model':<16}{'Has facts':>10}{'Avg #':>8}{'L1 empty':>10}{'L2+ has':>10}"
print(header)
print("─" * len(header))

for m in model_names:
    display = MODEL_DISPLAY.get(m, m)[:15]
    has_facts = 0
    total_facts = 0
    l1_empty = 0
    l1_total = 0
    l2plus_has = 0
    l2plus_total = 0

    for pid in models[m]:
        l = get_label(m, pid)
        if not l:
            continue
        # "or []" normalizes a null/missing specific_facts field to empty.
        facts = l.get("specific_facts") or []
        spec = l["specificity_level"]

        if facts:
            has_facts += 1
            total_facts += len(facts)

        if spec == 1:
            l1_total += 1
            if not facts:
                l1_empty += 1
        else:
            l2plus_total += 1
            if facts:
                l2plus_has += 1

    # max(1, ...) guards the divisions when a bucket is empty.
    total = len(models[m])
    print(f"{display:<16}"
          f"{has_facts/total*100:>9.1f}%"
          f"{total_facts/max(1,has_facts):>8.1f}"
          f"{l1_empty/max(1,l1_total)*100:>9.1f}%"
          f"{l2plus_has/max(1,l2plus_total)*100:>9.1f}%")
|
|
|
|
# ═══════════════════════════════════════════════════════════
# 7. MULTI-MODEL CONSENSUS ANALYSIS
# ═══════════════════════════════════════════════════════════
print("\n" + "═" * 70)
print("MULTI-MODEL CONSENSUS")
print("═" * 70)

# For paragraphs common to all models
if len(all_pids) >= 100:
    cat_unanimous = 0
    spec_unanimous = 0
    both_unanimous = 0
    cat_majority = 0
    spec_majority = 0

    for pid in all_pids:
        # Collect every model's vote for this paragraph.
        cats = []
        specs = []
        for m in model_names:
            l = get_label(m, pid)
            if l:
                cats.append(l["content_category"])
                specs.append(l["specificity_level"])

        cat_counts = Counter(cats)
        spec_counts = Counter(specs)
        # Size of the largest voting bloc for each dimension. (all_pids is
        # the intersection of all models' pids, so cats/specs are non-empty.)
        top_cat_n = cat_counts.most_common(1)[0][1]
        top_spec_n = spec_counts.most_common(1)[0][1]

        if len(set(cats)) == 1:
            cat_unanimous += 1
        if len(set(specs)) == 1:
            spec_unanimous += 1
        if len(set(cats)) == 1 and len(set(specs)) == 1:
            both_unanimous += 1
        # Strict majority: the top label must beat half the votes outright.
        if top_cat_n > len(cats) / 2:
            cat_majority += 1
        if top_spec_n > len(specs) / 2:
            spec_majority += 1

    n = len(all_pids)
    print(f" Category unanimous: {cat_unanimous}/{n} ({cat_unanimous/n*100:.1f}%)")
    print(f" Category majority: {cat_majority}/{n} ({cat_majority/n*100:.1f}%)")
    print(f" Specificity unanimous: {spec_unanimous}/{n} ({spec_unanimous/n*100:.1f}%)")
    print(f" Specificity majority: {spec_majority}/{n} ({spec_majority/n*100:.1f}%)")
    print(f" Both unanimous: {both_unanimous}/{n} ({both_unanimous/n*100:.1f}%)")
else:
    print(f" Only {len(all_pids)} common paragraphs — skipping full consensus")
    print(" (Some models may still be running)")
|
|
|
|
# ═══════════════════════════════════════════════════════════
# 8. COST SUMMARY
# ═══════════════════════════════════════════════════════════
# Per-model totals/averages pulled from each annotation's "provenance"
# metadata: total USD cost, mean latency, mean total tokens per paragraph,
# mean reasoning tokens per paragraph.
print("\n" + "═" * 70)
print("COST & LATENCY SUMMARY")
print("═" * 70)

header = f"{'Model':<16}{'Cost':>10}{'Avg ms':>10}{'Tokens/p':>10}{'Reason/p':>10}"
print(header)
print("─" * len(header))

total_cost = 0.0
for m in model_names:
    display = MODEL_DISPLAY.get(m, m)[:15]
    costs = []
    latencies = []
    tokens = []
    reasoning = []
    # Single pass over the annotations; the previous version made a second
    # full pass just to compute the token average.
    for ann in models[m].values():
        prov = ann.get("provenance", {})
        costs.append(prov.get("costUsd", 0))
        latencies.append(prov.get("latencyMs", 0))
        tokens.append(prov.get("inputTokens", 0) + prov.get("outputTokens", 0))
        reasoning.append(prov.get("reasoningTokens", 0))

    cost = sum(costs)
    total_cost += cost
    # Guard empty lists so np.mean([]) never produces a NaN in the table.
    avg_lat = np.mean(latencies) if latencies else 0
    avg_tok = np.mean(tokens) if tokens else 0
    avg_reason = np.mean(reasoning) if reasoning else 0

    print(f"{display:<16}${cost:>9.4f}{avg_lat:>9.0f}ms{avg_tok:>10.0f}{avg_reason:>10.0f}")

print(f"\n Total benchmark cost: ${total_cost:.4f}")

print("\n" + "═" * 70)
print("DONE")
print("═" * 70)
|