# SEC-cyBERT/scripts/analyze-v2-bench.py
# 2026-04-04 22:49:24 -04:00
#
# 478 lines
# 18 KiB
# Python

"""
V2 holdout benchmark analysis.
Compares all models in data/annotations/v2-bench/ on the 1,200 v2 holdout.
Uses GPT-5.4 (v4.5) as reference since it's our best-validated model.
Outputs:
- Per-model distribution tables (category + specificity)
- Pairwise agreement matrix (category, specificity, both)
- Per-model agreement with GPT-5.4 reference
- Confusion patterns: where models disagree and why
- Confidence distribution per model
- Specific facts coverage analysis
"""
import json
import sys
from collections import Counter, defaultdict
from itertools import combinations
from pathlib import Path
import numpy as np
# Repository root: this script lives in <root>/scripts/, so two .parent hops up.
ROOT = Path(__file__).resolve().parent.parent
V2_BENCH = ROOT / "data/annotations/v2-bench"  # per-model holdout annotations (*.jsonl)
GOLDEN_DIR = ROOT / "data/annotations/golden"  # golden reference annotations (opus.jsonl)
# Closed set of content categories the annotation models emit.
CATEGORIES = [
    "Board Governance", "Management Role", "Risk Management Process",
    "Third-Party Risk", "Incident Disclosure", "Strategy Integration", "None/Other",
]
# Short codes used in compact table headers / disagreement patterns.
CAT_SHORT = {"Board Governance": "BG", "Management Role": "MR",
             "Risk Management Process": "RMP", "Third-Party Risk": "TP",
             "Incident Disclosure": "ID", "Strategy Integration": "SI",
             "None/Other": "N/O"}
# Ordinal specificity levels (1=lowest .. 4=highest) and their display labels.
SPEC_LABELS = {1: "L1", 2: "L2", 3: "L3", 4: "L4"}
# Annotation-file stem -> human-readable name. Insertion order also defines
# the preferred display ordering of models in every table (see model_names).
MODEL_DISPLAY = {
    "gemini-3.1-flash-lite-preview": "Gemini Lite",
    "mimo-v2-flash": "MIMO Flash",
    "grok-4.1-fast": "Grok Fast",
    "gpt-5.4": "GPT-5.4",
    "kimi-k2.5": "Kimi K2.5",
    "gemini-3.1-pro-preview": "Gemini Pro",
    "glm-5": "GLM-5",
    "minimax-m2.7": "MiniMax M2.7",
    "mimo-v2-pro": "MIMO Pro",
}
# Best-validated model (per module docstring); used as the comparison baseline.
REFERENCE_MODEL = "gpt-5.4"
def load_jsonl(path: Path) -> list[dict]:
    """Parse a JSONL file into a list of records, skipping blank lines.

    Args:
        path: File containing one JSON object per line.

    Returns:
        Decoded records in file order.

    Raises:
        json.JSONDecodeError: on a malformed line.
    """
    records = []
    # Explicit UTF-8: annotation text may contain non-ASCII characters and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records
def cohens_kappa(a: list, b: list) -> float:
    """Unweighted Cohen's kappa between two equal-length label sequences.

    Returns 0.0 for empty input and 1.0 when expected chance agreement is
    already >= 1 (single shared label), avoiding a 0/0 division.

    Raises:
        ValueError: if the sequences differ in length.  (Was an ``assert``,
        which is silently stripped under ``python -O``.)
    """
    if len(a) != len(b):
        raise ValueError(f"length mismatch: {len(a)} != {len(b)}")
    n = len(a)
    if n == 0:
        return 0.0
    labels = sorted(set(a) | set(b))
    idx = {l: i for i, l in enumerate(labels)}
    k = len(labels)
    conf = np.zeros((k, k))
    for x, y in zip(a, b):
        conf[idx[x]][idx[y]] += 1
    po = np.trace(conf) / n  # observed agreement
    # Expected agreement from the marginal label frequencies.
    pe = sum((conf[i, :].sum() / n) * (conf[:, i].sum() / n) for i in range(k))
    if pe >= 1.0:
        return 1.0
    return (po - pe) / (1 - pe)
def weighted_kappa(a: list[int], b: list[int]) -> float:
    """Quadratic-weighted Cohen's kappa for ordinal specificity levels.

    Disagreements are penalised by squared distance between levels, so a
    1-vs-4 disagreement costs far more than 1-vs-2.

    Returns 0.0 for empty input and 1.0 when only a single label value occurs
    (agreement is trivially perfect; previously this crashed with
    ZeroDivisionError because the weight normaliser (k-1)**2 was zero).

    Raises:
        ValueError: if the sequences differ in length.
    """
    if len(a) != len(b):
        raise ValueError(f"length mismatch: {len(a)} != {len(b)}")
    n = len(a)
    if n == 0:
        return 0.0
    labels = sorted(set(a) | set(b))
    k = len(labels)
    if k == 1:
        # Both raters used the same single level everywhere.
        return 1.0
    idx = {l: i for i, l in enumerate(labels)}
    conf = np.zeros((k, k))
    for x, y in zip(a, b):
        conf[idx[x]][idx[y]] += 1
    # Quadratic penalty matrix, normalised to [0, 1].
    weights = np.zeros((k, k))
    for i in range(k):
        for j in range(k):
            weights[i][j] = (i - j) ** 2 / (k - 1) ** 2
    po = 1 - np.sum(weights * conf) / n
    expected = np.outer(conf.sum(axis=1), conf.sum(axis=0)) / n
    pe = 1 - np.sum(weights * expected) / n
    if pe == 0:
        return 1.0
    return (po - pe) / (1 - pe)
# ── Load all models ──
print("Loading v2-bench annotations...")
models: dict[str, dict[str, dict]] = {}  # model_short -> {pid -> annotation}
for jsonl_path in sorted(V2_BENCH.glob("*.jsonl")):
    # Error dumps and superseded v4 reference runs are not benchmark entries.
    if "errors" in jsonl_path.name or jsonl_path.stem.startswith("gpt-5.4.v4"):
        continue
    rows = load_jsonl(jsonl_path)
    if len(rows) < 100:
        print(f" SKIP {jsonl_path.name}: only {len(rows)} records")
        continue
    pid_map = {row["paragraphId"]: row for row in rows}
    models[jsonl_path.stem] = pid_map
    print(f" {MODEL_DISPLAY.get(jsonl_path.stem, jsonl_path.stem)}: {len(pid_map)} annotations")
# Load Opus golden if available
opus_path = GOLDEN_DIR / "opus.jsonl"
if opus_path.exists():
    opus_rows = load_jsonl(opus_path)
    if len(opus_rows) >= 100:
        pid_map = {row["paragraphId"]: row for row in opus_rows}
        models["opus-4.6"] = pid_map
        MODEL_DISPLAY["opus-4.6"] = "Opus 4.6"
        print(f" Opus 4.6: {len(pid_map)} annotations")
# Paragraph IDs common to every loaded model.
all_pids = set.intersection(*(set(m.keys()) for m in models.values())) if models else set()
print(f"\n {len(all_pids)} paragraphs common to all {len(models)} models")
if not all_pids:
    # Fall back to pairwise comparisons against the reference model's set.
    ref = models.get(REFERENCE_MODEL)
    if ref:
        all_pids = set(ref.keys())
        print(f" Using {len(all_pids)} reference model paragraphs for pairwise analysis")
# Stable display order: MODEL_DISPLAY insertion order first, unknown stems last.
_display_rank = {name: i for i, name in enumerate(MODEL_DISPLAY)}
model_names = sorted(models.keys(), key=lambda m: _display_rank.get(m, 999))
def get_label(model: str, pid: str) -> dict | None:
    """Return the label dict for (model, pid), or None if absent/empty.

    Records may carry the label under a "label" key or be the label itself
    (flat schema); both shapes are supported.
    """
    record = models.get(model, {}).get(pid)
    return record.get("label", record) if record else None
# ═══════════════════════════════════════════════════════════
# 1. DISTRIBUTION TABLES
# ═══════════════════════════════════════════════════════════
# NOTE(review): the separator prints were garbled to `"" * 70` (always empty);
# restored with rule characters matching the banner comments.
print("\n" + "═" * 70)
print("CATEGORY DISTRIBUTION")
print("═" * 70)
header = f"{'Model':<16}" + "".join(f"{s:>8}" for s in CAT_SHORT.values())
print(header)
print("─" * len(header))
for m in model_names:
    display = MODEL_DISPLAY.get(m, m)[:15]
    # Single pass: resolve each label once and keep only the usable ones.
    cats = [l["content_category"] for l in (get_label(m, pid) for pid in models[m]) if l]
    counts = Counter(cats)
    total = len(cats)
    row = f"{display:<16}"
    for full_name in CATEGORIES:
        pct = counts.get(full_name, 0) / total * 100 if total else 0
        row += f"{pct:>7.1f}%"
    print(row)
print("\n" + "═" * 70)
print("SPECIFICITY DISTRIBUTION")
print("═" * 70)
header = f"{'Model':<16}" + "".join(f"{s:>8}" for s in SPEC_LABELS.values()) + f"{'Med%':>8}"
print(header)
print("─" * len(header))
for m in model_names:
    display = MODEL_DISPLAY.get(m, m)[:15]
    labels = [l for l in (get_label(m, pid) for pid in models[m]) if l]
    specs = [l["specificity_level"] for l in labels]
    confs = [l.get("specificity_confidence", "high") for l in labels]
    counts = Counter(specs)
    total = len(specs)
    med_count = sum(1 for c in confs if c == "medium")
    row = f"{display:<16}"
    for level in SPEC_LABELS:
        pct = counts.get(level, 0) / total * 100 if total else 0
        row += f"{pct:>7.1f}%"
    med_pct = med_count / total * 100 if total else 0
    row += f"{med_pct:>7.1f}%"
    print(row)
# ═══════════════════════════════════════════════════════════
# 2. AGREEMENT WITH REFERENCE
# ═══════════════════════════════════════════════════════════
ref_data = models.get(REFERENCE_MODEL)
if ref_data:
    print("\n" + "═" * 70)
    print(f"AGREEMENT WITH {MODEL_DISPLAY.get(REFERENCE_MODEL, REFERENCE_MODEL).upper()}")
    print("═" * 70)
    header = f"{'Model':<16}{'Cat%':>8}{'Cat κ':>8}{'Spec%':>8}{'Spec κw':>8}{'Both%':>8}{'N':>6}"
    print(header)
    print("─" * len(header))
    for m in model_names:
        if m == REFERENCE_MODEL:
            continue
        display = MODEL_DISPLAY.get(m, m)[:15]
        common = set(models[m].keys()) & set(ref_data.keys())
        if len(common) < 100:
            print(f"{display:<16} (only {len(common)} common paragraphs)")
            continue
        ref_cats, m_cats = [], []
        ref_specs, m_specs = [], []
        both_match = 0
        for pid in common:
            rl = get_label(REFERENCE_MODEL, pid)
            ml = get_label(m, pid)
            if not rl or not ml:
                continue
            ref_cats.append(rl["content_category"])
            m_cats.append(ml["content_category"])
            ref_specs.append(rl["specificity_level"])
            m_specs.append(ml["specificity_level"])
            if rl["content_category"] == ml["content_category"] and rl["specificity_level"] == ml["specificity_level"]:
                both_match += 1
        n = len(ref_cats)
        if n == 0:
            # All common paragraphs lacked a usable label on one side;
            # previously this divided by zero.
            print(f"{display:<16} (no comparable labels)")
            continue
        cat_agree = sum(1 for a, b in zip(ref_cats, m_cats) if a == b) / n * 100
        spec_agree = sum(1 for a, b in zip(ref_specs, m_specs) if a == b) / n * 100
        both_pct = both_match / n * 100
        cat_k = cohens_kappa(ref_cats, m_cats)
        spec_kw = weighted_kappa(ref_specs, m_specs)
        print(f"{display:<16}{cat_agree:>7.1f}%{cat_k:>8.3f}{spec_agree:>7.1f}%{spec_kw:>8.3f}{both_pct:>7.1f}%{n:>6}")
# ═══════════════════════════════════════════════════════════
# 3. PAIRWISE AGREEMENT MATRIX (category kappa)
# ═══════════════════════════════════════════════════════════
print("\n" + "═" * 70)
print("PAIRWISE CATEGORY κ (lower triangle)")
print("═" * 70)
short_names = [MODEL_DISPLAY.get(m, m)[:10] for m in model_names]
header = f"{'':>12}" + "".join(f"{s:>12}" for s in short_names)
print(header)
for i, m1 in enumerate(model_names):
    row = f"{short_names[i]:>12}"
    for j, m2 in enumerate(model_names):
        if j >= i:
            # Upper triangle (incl. diagonal) left blank.
            row += f"{'':>12}"
            continue
        common = set(models[m1].keys()) & set(models[m2].keys())
        if len(common) < 100:
            row += f"{'n/a':>12}"
            continue
        # Single pass per pair: previously each pid triggered four
        # get_label() calls across two duplicated list comprehensions.
        cats1, cats2 = [], []
        for pid in common:
            l1 = get_label(m1, pid)
            l2 = get_label(m2, pid)
            if l1 and l2:
                cats1.append(l1["content_category"])
                cats2.append(l2["content_category"])
        k = cohens_kappa(cats1, cats2)
        row += f"{k:>12.3f}"
    print(row)
# ═══════════════════════════════════════════════════════════
# 4. SPECIFICITY CONFUSION WITH REFERENCE
# ═══════════════════════════════════════════════════════════
if ref_data:
    print("\n" + "═" * 70)
    print("SPECIFICITY CONFUSION vs REFERENCE (rows=model, cols=reference)")
    print("═" * 70)
    for m in model_names:
        if m == REFERENCE_MODEL:
            continue
        display = MODEL_DISPLAY.get(m, m)
        common = set(models[m].keys()) & set(ref_data.keys())
        if len(common) < 100:
            continue
        conf = np.zeros((4, 4), dtype=int)
        for pid in common:
            rl = get_label(REFERENCE_MODEL, pid)
            ml = get_label(m, pid)
            if not rl or not ml:
                continue
            ref_s = rl["specificity_level"] - 1
            mod_s = ml["specificity_level"] - 1
            # Guard: a level outside 1-4 previously wrapped via negative
            # indexing (level 0 -> row -1) and silently corrupted the matrix.
            if not (0 <= ref_s < 4 and 0 <= mod_s < 4):
                continue
            conf[mod_s][ref_s] += 1
        print(f"\n {display} (N={int(conf.sum())})")
        print(f" {'':>8}" + "".join(f"{'ref ' + SPEC_LABELS[l]:>8}" for l in range(1, 5)))
        for i in range(4):
            row_total = conf[i].sum()
            row = f" {SPEC_LABELS[i+1]:>8}"
            for j in range(4):
                row += f"{conf[i][j]:>8}"
            print(row + f" | {row_total}")
# ═══════════════════════════════════════════════════════════
# 5. CATEGORY DISAGREEMENT PATTERNS
# ═══════════════════════════════════════════════════════════
if ref_data:
    print("\n" + "═" * 70)
    print("TOP CATEGORY DISAGREEMENT PATTERNS vs REFERENCE")
    print("═" * 70)
    for m in model_names:
        if m == REFERENCE_MODEL:
            continue
        display = MODEL_DISPLAY.get(m, m)
        common = set(models[m].keys()) & set(ref_data.keys())
        if len(common) < 100:
            continue
        disagreements: Counter = Counter()
        for pid in common:
            rl = get_label(REFERENCE_MODEL, pid)
            ml = get_label(m, pid)
            if not rl or not ml:
                continue
            # .get(cat, cat) keeps an unexpected category from raising KeyError.
            rc = CAT_SHORT.get(rl["content_category"], rl["content_category"])
            mc = CAT_SHORT.get(ml["content_category"], ml["content_category"])
            if rc != mc:
                disagreements[(rc, mc)] += 1
        total_disagree = sum(disagreements.values())
        if total_disagree == 0:
            continue
        print(f"\n {display}: {total_disagree} disagreements ({total_disagree/len(common)*100:.1f}%)")
        # NOTE(review): the arrow between ref/model categories was lost in the
        # original f-string (`{rc}{mc}`); restored for readability.
        for (rc, mc), count in disagreements.most_common(5):
            print(f" {rc} → {mc}: {count}")
# ═══════════════════════════════════════════════════════════
# 6. SPECIFIC_FACTS COVERAGE
# ═══════════════════════════════════════════════════════════
print("\n" + "═" * 70)
print("SPECIFIC_FACTS COVERAGE")
print("═" * 70)
header = f"{'Model':<16}{'Has facts':>10}{'Avg #':>8}{'L1 empty':>10}{'L2+ has':>10}"
print(header)
print("─" * len(header))
for m in model_names:
    display = MODEL_DISPLAY.get(m, m)[:15]
    has_facts = 0      # paragraphs with at least one extracted fact
    total_facts = 0    # fact count across those paragraphs (for the average)
    l1_empty = 0       # L1 (boilerplate) paragraphs with no facts — expected
    l1_total = 0
    l2plus_has = 0     # L2+ paragraphs that do carry facts — desired
    l2plus_total = 0
    for pid in models[m]:
        l = get_label(m, pid)
        if not l:
            continue
        facts = l.get("specific_facts") or []
        spec = l["specificity_level"]
        if facts:
            has_facts += 1
            total_facts += len(facts)
        if spec == 1:
            l1_total += 1
            if not facts:
                l1_empty += 1
        else:
            l2plus_total += 1
            if facts:
                l2plus_has += 1
    # max(1, ...) guards empty buckets; the `total` division now gets the same
    # guard the other three already had.
    total = max(1, len(models[m]))
    print(f"{display:<16}"
          f"{has_facts/total*100:>9.1f}%"
          f"{total_facts/max(1,has_facts):>8.1f}"
          f"{l1_empty/max(1,l1_total)*100:>9.1f}%"
          f"{l2plus_has/max(1,l2plus_total)*100:>9.1f}%")
# ═══════════════════════════════════════════════════════════
# 7. MULTI-MODEL CONSENSUS ANALYSIS
# ═══════════════════════════════════════════════════════════
print("\n" + "═" * 70)
print("MULTI-MODEL CONSENSUS")
print("═" * 70)
# Only meaningful on the paragraph set common to every model.
if len(all_pids) >= 100:
    cat_unanimous = 0
    spec_unanimous = 0
    both_unanimous = 0
    cat_majority = 0
    spec_majority = 0
    for pid in all_pids:
        cats = []
        specs = []
        for m in model_names:
            l = get_label(m, pid)
            if l:
                cats.append(l["content_category"])
                specs.append(l["specificity_level"])
        if not cats:
            # No usable labels at all for this pid; previously
            # most_common(1)[0] would raise IndexError.
            continue
        cat_counts = Counter(cats)
        spec_counts = Counter(specs)
        top_cat_n = cat_counts.most_common(1)[0][1]
        top_spec_n = spec_counts.most_common(1)[0][1]
        if len(set(cats)) == 1:
            cat_unanimous += 1
        if len(set(specs)) == 1:
            spec_unanimous += 1
        if len(set(cats)) == 1 and len(set(specs)) == 1:
            both_unanimous += 1
        if top_cat_n > len(cats) / 2:
            cat_majority += 1
        if top_spec_n > len(specs) / 2:
            spec_majority += 1
    n = len(all_pids)
    print(f" Category unanimous: {cat_unanimous}/{n} ({cat_unanimous/n*100:.1f}%)")
    print(f" Category majority: {cat_majority}/{n} ({cat_majority/n*100:.1f}%)")
    print(f" Specificity unanimous: {spec_unanimous}/{n} ({spec_unanimous/n*100:.1f}%)")
    print(f" Specificity majority: {spec_majority}/{n} ({spec_majority/n*100:.1f}%)")
    print(f" Both unanimous: {both_unanimous}/{n} ({both_unanimous/n*100:.1f}%)")
else:
    print(f" Only {len(all_pids)} common paragraphs — skipping full consensus")
    print(" (Some models may still be running)")
# ═══════════════════════════════════════════════════════════
# 8. COST SUMMARY
# ═══════════════════════════════════════════════════════════
print("\n" + "═" * 70)
print("COST & LATENCY SUMMARY")
print("═" * 70)
header = f"{'Model':<16}{'Cost':>10}{'Avg ms':>10}{'Tokens/p':>10}{'Reason/p':>10}"
print(header)
print("─" * len(header))
total_cost = 0
for m in model_names:
    display = MODEL_DISPLAY.get(m, m)[:15]
    costs = []
    latencies = []
    reasoning = []
    tokens = []
    # Single pass over provenance; the original recomputed input/output token
    # sums in a second full pass over models[m].values().
    for ann in models[m].values():
        prov = ann.get("provenance", {})
        costs.append(prov.get("costUsd", 0))
        latencies.append(prov.get("latencyMs", 0))
        reasoning.append(prov.get("reasoningTokens", 0))
        tokens.append(prov.get("inputTokens", 0) + prov.get("outputTokens", 0))
    cost = sum(costs)
    total_cost += cost
    # Empty guards avoid np.mean([]) -> NaN (with a RuntimeWarning).
    avg_lat = np.mean(latencies) if latencies else 0
    avg_tok = np.mean(tokens) if tokens else 0
    avg_reason = np.mean(reasoning) if reasoning else 0
    print(f"{display:<16}${cost:>9.4f}{avg_lat:>9.0f}ms{avg_tok:>10.0f}{avg_reason:>10.0f}")
print(f"\n Total benchmark cost: ${total_cost:.4f}")
print("\n" + "═" * 70)
print("DONE")
print("═" * 70)