"""V2 holdout benchmark analysis.

Compares all models in data/annotations/v2-bench/ on the 1,200 v2 holdout.
Uses GPT-5.4 (v4.5) as reference since it's our best-validated model.

Outputs:
- Per-model distribution tables (category + specificity)
- Pairwise agreement matrix (category, specificity, both)
- Per-model agreement with GPT-5.4 reference
- Confusion patterns: where models disagree and why
- Confidence distribution per model
- Specific facts coverage analysis
"""
|
|
|
|
import json
|
|
import sys
|
|
from collections import Counter, defaultdict
|
|
from itertools import combinations
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
|
|
# Project-relative data locations (script lives one level below the repo root).
ROOT = Path(__file__).resolve().parent.parent
V2_BENCH = ROOT / "data/annotations/v2-bench"
GOLDEN_DIR = ROOT / "data/annotations/golden"

# Annotation taxonomy: content categories and their table abbreviations.
# NOTE: CAT_SHORT's insertion order drives the column order of report tables,
# so it must stay aligned with CATEGORIES.
CATEGORIES = [
    "Board Governance",
    "Management Role",
    "Risk Management Process",
    "Third-Party Risk",
    "Incident Disclosure",
    "Strategy Integration",
    "None/Other",
]
CAT_SHORT = {
    "Board Governance": "BG",
    "Management Role": "MR",
    "Risk Management Process": "RMP",
    "Third-Party Risk": "TP",
    "Incident Disclosure": "ID",
    "Strategy Integration": "SI",
    "None/Other": "N/O",
}

# Ordinal specificity levels 1-4 and their display labels.
SPEC_LABELS = {1: "L1", 2: "L2", 3: "L3", 4: "L4"}

# File-stem -> human-readable name. Insertion order also fixes the row
# ordering of every per-model table (see the model_names sort below).
MODEL_DISPLAY = {
    "gemini-3.1-flash-lite-preview": "Gemini Lite",
    "mimo-v2-flash": "MIMO Flash",
    "grok-4.1-fast": "Grok Fast",
    "gpt-5.4": "GPT-5.4",
    "kimi-k2.5": "Kimi K2.5",
    "gemini-3.1-pro-preview": "Gemini Pro",
    "glm-5": "GLM-5",
    "minimax-m2.7": "MiniMax M2.7",
    "mimo-v2-pro": "MIMO Pro",
}

# Best-validated model; all "agreement with reference" numbers use it.
REFERENCE_MODEL = "gpt-5.4"
|
|
|
|
|
|
def load_jsonl(path: Path) -> list[dict]:
    """Load a JSON-Lines file, skipping blank lines.

    Args:
        path: Path to a ``.jsonl`` file (one JSON object per line).

    Returns:
        Parsed records in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records: list[dict] = []
    # Explicit UTF-8 so parsing doesn't depend on the platform's locale
    # encoding (annotation files may contain non-ASCII text).
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:  # tolerate blank/padding lines between records
                records.append(json.loads(line))
    return records
|
|
|
|
|
|
def cohens_kappa(a: list, b: list) -> float:
    """Cohen's kappa: chance-corrected agreement between two label sequences."""
    assert len(a) == len(b)
    total = len(a)
    if total == 0:
        return 0.0
    # Map each observed label (from either sequence) to a matrix index.
    label_index = {lab: pos for pos, lab in enumerate(sorted(set(a) | set(b)))}
    size = len(label_index)
    matrix = np.zeros((size, size))
    for left, right in zip(a, b):
        matrix[label_index[left], label_index[right]] += 1
    # Observed agreement: fraction of pairs on the diagonal.
    observed = np.trace(matrix) / total
    # Chance agreement: inner product of the two raters' marginal distributions.
    row_marginals = matrix.sum(axis=1) / total
    col_marginals = matrix.sum(axis=0) / total
    chance = float(np.dot(row_marginals, col_marginals))
    if chance >= 1.0:
        # Degenerate case (e.g. a single label everywhere): define as perfect.
        return 1.0
    return (observed - chance) / (1 - chance)
|
|
|
|
|
|
def weighted_kappa(a: list[int], b: list[int]) -> float:
    """Quadratic-weighted kappa for ordinal specificity.

    Disagreements are penalized by the squared distance between the two
    labels' ranks, so L1-vs-L2 costs less than L1-vs-L4.

    Args:
        a: First rater's ordinal labels.
        b: Second rater's ordinal labels (same length as ``a``).

    Returns:
        Kappa in [-1, 1]. 0.0 for empty input; 1.0 when only one label
        value is observed (both raters trivially agree everywhere).
    """
    assert len(a) == len(b)
    n = len(a)
    if n == 0:
        return 0.0
    labels = sorted(set(a) | set(b))
    k = len(labels)
    # With a single observed label the (k-1)^2 weight denominator is zero
    # (previously a ZeroDivisionError) and agreement is trivially perfect.
    if k < 2:
        return 1.0
    idx = {l: i for i, l in enumerate(labels)}
    conf = np.zeros((k, k))
    for x, y in zip(a, b):
        conf[idx[x]][idx[y]] += 1
    # Quadratic penalty matrix, normalized so the largest disagreement = 1.
    weights = np.zeros((k, k))
    for i in range(k):
        for j in range(k):
            weights[i][j] = (i - j) ** 2 / (k - 1) ** 2
    po = 1 - np.sum(weights * conf) / n  # observed (weighted) agreement
    expected = np.outer(conf.sum(axis=1), conf.sum(axis=0)) / n
    pe = 1 - np.sum(weights * expected) / n  # chance (weighted) agreement
    # Guard the pe == 1 degenerate case (division by zero), mirroring
    # cohens_kappa. The old code tested pe == 0, which instead returned a
    # spurious 1.0 for chance-level total disagreement.
    if pe >= 1.0:
        return 1.0
    return (po - pe) / (1 - pe)
|
|
|
|
|
|
# ── Load all models ──
print("Loading v2-bench annotations...")

# model file-stem -> {paragraphId -> full annotation record}
models: dict[str, dict[str, dict]] = {}  # model_short -> {pid -> annotation}
for f in sorted(V2_BENCH.glob("*.jsonl")):
    # Skip error dumps and "gpt-5.4.v4*" stems — presumably older
    # prompt-version runs of the reference model; TODO confirm the current
    # run uses the plain "gpt-5.4" stem.
    if "errors" in f.name or f.stem.startswith("gpt-5.4.v4"):
        continue
    records = load_jsonl(f)
    # Fewer than 100 records is treated as an incomplete annotation run.
    if len(records) < 100:
        print(f" SKIP {f.name}: only {len(records)} records")
        continue
    model_short = f.stem
    # Index by paragraphId; if a pid repeats in one file, the last record wins.
    by_pid = {r["paragraphId"]: r for r in records}
    models[model_short] = by_pid
    display = MODEL_DISPLAY.get(model_short, model_short)
    print(f" {display}: {len(by_pid)} annotations")

# Load Opus golden if available
opus_path = GOLDEN_DIR / "opus.jsonl"
if opus_path.exists():
    records = load_jsonl(opus_path)
    if len(records) >= 100:
        by_pid = {r["paragraphId"]: r for r in records}
        models["opus-4.6"] = by_pid
        # Register a display name so the report tables label it nicely.
        MODEL_DISPLAY["opus-4.6"] = "Opus 4.6"
        print(f" Opus 4.6: {len(by_pid)} annotations")

# Common paragraph IDs across all models
all_pids = set.intersection(*(set(m.keys()) for m in models.values())) if models else set()
print(f"\n {len(all_pids)} paragraphs common to all {len(models)} models")

if not all_pids:
    # Fall back to pairwise with reference
    ref = models.get(REFERENCE_MODEL)
    if ref:
        all_pids = set(ref.keys())
        print(f" Using {len(all_pids)} reference model paragraphs for pairwise analysis")

# Stable row ordering for every table: MODEL_DISPLAY declaration order,
# with any unknown stems pushed to the end (index 999).
model_names = sorted(models.keys(), key=lambda m: list(MODEL_DISPLAY.keys()).index(m) if m in MODEL_DISPLAY else 999)
|
|
|
|
|
|
def get_label(model: str, pid: str) -> dict | None:
    """Return the label payload for (model, paragraph), or None if absent.

    Annotation records may nest their fields under a "label" key or carry
    them at the top level; whichever dict holds the label fields is returned.
    """
    annotation = models.get(model, {}).get(pid)
    if annotation:
        return annotation.get("label", annotation)
    return None
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
# 1. DISTRIBUTION TABLES
# ═══════════════════════════════════════════════════════════
print("\n" + "═" * 70)
print("CATEGORY DISTRIBUTION")
print("═" * 70)

# One column per category, in CAT_SHORT declaration order.
header = f"{'Model':<16}" + "".join(f"{s:>8}" for s in CAT_SHORT.values())
print(header)
print("─" * len(header))

for m in model_names:
    display = MODEL_DISPLAY.get(m, m)[:15]
    # Fetch every label for this model, then drop paragraphs with no payload.
    cats = [get_label(m, pid) for pid in models[m]]
    cats = [l["content_category"] for l in cats if l]
    counts = Counter(cats)
    total = len(cats)
    row = f"{display:<16}"
    for full_name in CATEGORIES:
        # Percentage of this model's labels in each category (0 if none labeled).
        pct = counts.get(full_name, 0) / total * 100 if total else 0
        row += f"{pct:>7.1f}%"
    print(row)

print("\n" + "═" * 70)
print("SPECIFICITY DISTRIBUTION")
print("═" * 70)

# L1-L4 columns plus the share of "medium"-confidence specificity calls.
header = f"{'Model':<16}" + "".join(f"{s:>8}" for s in SPEC_LABELS.values()) + f"{'Med%':>8}"
print(header)
print("─" * len(header))

for m in model_names:
    display = MODEL_DISPLAY.get(m, m)[:15]
    labels = [get_label(m, pid) for pid in models[m]]
    specs = [l["specificity_level"] for l in labels if l]
    # Missing confidence defaults to "high", so Med% only counts explicit "medium".
    confs = [l.get("specificity_confidence", "high") for l in labels if l]
    counts = Counter(specs)
    total = len(specs)
    med_count = sum(1 for c in confs if c == "medium")
    row = f"{display:<16}"
    for level in SPEC_LABELS:
        pct = counts.get(level, 0) / total * 100 if total else 0
        row += f"{pct:>7.1f}%"
    med_pct = med_count / total * 100 if total else 0
    row += f"{med_pct:>7.1f}%"
    print(row)
|
|
|
|
# ═══════════════════════════════════════════════════════════
# 2. AGREEMENT WITH REFERENCE
# ═══════════════════════════════════════════════════════════
ref_data = models.get(REFERENCE_MODEL)
if ref_data:
    print("\n" + "═" * 70)
    print(f"AGREEMENT WITH {MODEL_DISPLAY.get(REFERENCE_MODEL, REFERENCE_MODEL).upper()}")
    print("═" * 70)

    # Cat%/Spec%: raw agreement. Cat κ: Cohen's kappa. Spec κw: quadratic-
    # weighted kappa. Both%: exact match on category AND specificity.
    header = f"{'Model':<16}{'Cat%':>8}{'Cat κ':>8}{'Spec%':>8}{'Spec κw':>8}{'Both%':>8}{'N':>6}"
    print(header)
    print("─" * len(header))

    for m in model_names:
        if m == REFERENCE_MODEL:
            continue
        display = MODEL_DISPLAY.get(m, m)[:15]
        common = set(models[m].keys()) & set(ref_data.keys())
        # Too little overlap to report meaningful agreement statistics.
        if len(common) < 100:
            print(f"{display:<16} (only {len(common)} common paragraphs)")
            continue

        # Parallel label lists; indices stay aligned because both are
        # appended in the same loop iteration.
        ref_cats, m_cats = [], []
        ref_specs, m_specs = [], []
        both_match = 0

        for pid in common:
            rl = get_label(REFERENCE_MODEL, pid)
            ml = get_label(m, pid)
            if not rl or not ml:
                continue
            ref_cats.append(rl["content_category"])
            m_cats.append(ml["content_category"])
            ref_specs.append(rl["specificity_level"])
            m_specs.append(ml["specificity_level"])
            if rl["content_category"] == ml["content_category"] and rl["specificity_level"] == ml["specificity_level"]:
                both_match += 1

        # NOTE(review): if every common paragraph lacked a label payload,
        # n would be 0 and the divisions below would raise — confirm label
        # payloads are always present in practice.
        n = len(ref_cats)
        cat_agree = sum(1 for a, b in zip(ref_cats, m_cats) if a == b) / n * 100
        spec_agree = sum(1 for a, b in zip(ref_specs, m_specs) if a == b) / n * 100
        both_pct = both_match / n * 100
        cat_k = cohens_kappa(ref_cats, m_cats)
        spec_kw = weighted_kappa(ref_specs, m_specs)

        print(f"{display:<16}{cat_agree:>7.1f}%{cat_k:>8.3f}{spec_agree:>7.1f}%{spec_kw:>8.3f}{both_pct:>7.1f}%{n:>6}")
|
|
|
|
# ═══════════════════════════════════════════════════════════
# 3. PAIRWISE AGREEMENT MATRIX (category kappa)
# ═══════════════════════════════════════════════════════════
# Lower-triangular matrix of Cohen's kappa on content_category for every
# model pair; the diagonal and upper triangle are printed as "—".
print("\n" + "═" * 70)
print("PAIRWISE CATEGORY κ (lower triangle)")
print("═" * 70)

short_names = [MODEL_DISPLAY.get(m, m)[:10] for m in model_names]
header = f"{'':>12}" + "".join(f"{s:>12}" for s in short_names)
print(header)

for i, m1 in enumerate(model_names):
    row = f"{short_names[i]:>12}"
    for j, m2 in enumerate(model_names):
        if j >= i:
            row += f"{'—':>12}"
            continue
        common = set(models[m1].keys()) & set(models[m2].keys())
        if len(common) < 100:
            # Not enough shared paragraphs for a meaningful kappa.
            row += f"{'n/a':>12}"
            continue
        # Fetch each label once per paragraph (the previous version called
        # get_label four times per pid across two comprehensions); the two
        # filtered lists stay index-aligned because they share one filter.
        pairs = [(get_label(m1, pid), get_label(m2, pid)) for pid in common]
        cats1 = [l1["content_category"] for l1, l2 in pairs if l1 and l2]
        cats2 = [l2["content_category"] for l1, l2 in pairs if l1 and l2]
        k = cohens_kappa(cats1, cats2)
        row += f"{k:>12.3f}"
    print(row)
|
|
|
|
# ═══════════════════════════════════════════════════════════
# 4. SPECIFICITY CONFUSION WITH REFERENCE
# ═══════════════════════════════════════════════════════════
if ref_data:
    print("\n" + "═" * 70)
    print("SPECIFICITY CONFUSION vs REFERENCE (rows=model, cols=reference)")
    print("═" * 70)

    for m in model_names:
        if m == REFERENCE_MODEL:
            continue
        display = MODEL_DISPLAY.get(m, m)
        common = set(models[m].keys()) & set(ref_data.keys())
        if len(common) < 100:
            continue

        # 4x4 confusion matrix: row = this model's level, col = reference's.
        # Assumes specificity_level is always in 1..4 (per SPEC_LABELS) —
        # an out-of-range value would raise IndexError here; TODO confirm.
        conf = np.zeros((4, 4), dtype=int)
        for pid in common:
            rl = get_label(REFERENCE_MODEL, pid)
            ml = get_label(m, pid)
            if not rl or not ml:
                continue
            ref_s = rl["specificity_level"] - 1
            mod_s = ml["specificity_level"] - 1
            conf[mod_s][ref_s] += 1

        print(f"\n {display} (N={int(conf.sum())})")
        print(f" {'':>8}" + "".join(f"{'ref ' + SPEC_LABELS[l]:>8}" for l in range(1, 5)))
        for i in range(4):
            row_total = conf[i].sum()
            row = f" {SPEC_LABELS[i+1]:>8}"
            for j in range(4):
                row += f"{conf[i][j]:>8}"
            # Trailing "| total" column shows how often the model chose level i+1.
            print(row + f" | {row_total}")
|
|
|
|
# ═══════════════════════════════════════════════════════════
# 5. CATEGORY DISAGREEMENT PATTERNS
# ═══════════════════════════════════════════════════════════
if ref_data:
    print("\n" + "═" * 70)
    print("TOP CATEGORY DISAGREEMENT PATTERNS vs REFERENCE")
    print("═" * 70)

    for m in model_names:
        if m == REFERENCE_MODEL:
            continue
        display = MODEL_DISPLAY.get(m, m)
        common = set(models[m].keys()) & set(ref_data.keys())
        if len(common) < 100:
            continue

        # (reference_category, model_category) -> count, using short codes.
        disagreements: Counter = Counter()
        for pid in common:
            rl = get_label(REFERENCE_MODEL, pid)
            ml = get_label(m, pid)
            if not rl or not ml:
                continue
            rc = CAT_SHORT[rl["content_category"]]
            mc = CAT_SHORT[ml["content_category"]]
            if rc != mc:
                disagreements[(rc, mc)] += 1

        total_disagree = sum(disagreements.values())
        if total_disagree == 0:
            continue

        print(f"\n {display}: {total_disagree} disagreements ({total_disagree/len(common)*100:.1f}%)")
        # Top 5 directed confusion pairs (reference label → model label).
        for (rc, mc), count in disagreements.most_common(5):
            print(f" {rc} → {mc}: {count}")
|
|
|
|
# ═══════════════════════════════════════════════════════════
# 6. SPECIFIC_FACTS COVERAGE
# ═══════════════════════════════════════════════════════════
print("\n" + "═" * 70)
print("SPECIFIC_FACTS COVERAGE")
print("═" * 70)

# Has facts: % of paragraphs with >=1 extracted fact. Avg #: facts per
# fact-bearing paragraph. L1 empty: % of level-1 labels with no facts
# (expected). L2+ has: % of level-2+ labels that do carry facts (expected).
header = f"{'Model':<16}{'Has facts':>10}{'Avg #':>8}{'L1 empty':>10}{'L2+ has':>10}"
print(header)
print("─" * len(header))

for m in model_names:
    display = MODEL_DISPLAY.get(m, m)[:15]
    has_facts = 0
    total_facts = 0
    l1_empty = 0
    l1_total = 0
    l2plus_has = 0
    l2plus_total = 0

    for pid in models[m]:
        l = get_label(m, pid)
        if not l:
            continue
        # "or []" normalizes a null/missing specific_facts field to empty.
        facts = l.get("specific_facts") or []
        spec = l["specificity_level"]

        if facts:
            has_facts += 1
            total_facts += len(facts)

        if spec == 1:
            l1_total += 1
            if not facts:
                l1_empty += 1
        else:
            l2plus_total += 1
            if facts:
                l2plus_has += 1

    # max(1, ...) guards the divisions when a bucket is empty.
    total = len(models[m])
    print(f"{display:<16}"
          f"{has_facts/total*100:>9.1f}%"
          f"{total_facts/max(1,has_facts):>8.1f}"
          f"{l1_empty/max(1,l1_total)*100:>9.1f}%"
          f"{l2plus_has/max(1,l2plus_total)*100:>9.1f}%")
|
|
|
|
# ═══════════════════════════════════════════════════════════
# 7. MULTI-MODEL CONSENSUS ANALYSIS
# ═══════════════════════════════════════════════════════════
print("\n" + "═" * 70)
print("MULTI-MODEL CONSENSUS")
print("═" * 70)

# For paragraphs common to all models
if len(all_pids) >= 100:
    cat_unanimous = 0
    spec_unanimous = 0
    both_unanimous = 0
    cat_majority = 0
    spec_majority = 0

    for pid in all_pids:
        # Collect every model's vote for this paragraph.
        cats = []
        specs = []
        for m in model_names:
            l = get_label(m, pid)
            if l:
                cats.append(l["content_category"])
                specs.append(l["specificity_level"])

        cat_counts = Counter(cats)
        spec_counts = Counter(specs)
        # Size of the largest voting bloc for each dimension. (all_pids is
        # the intersection of all models' pids, so cats/specs are non-empty.)
        top_cat_n = cat_counts.most_common(1)[0][1]
        top_spec_n = spec_counts.most_common(1)[0][1]

        if len(set(cats)) == 1:
            cat_unanimous += 1
        if len(set(specs)) == 1:
            spec_unanimous += 1
        if len(set(cats)) == 1 and len(set(specs)) == 1:
            both_unanimous += 1
        # Strict majority: the top label must beat half the votes outright.
        if top_cat_n > len(cats) / 2:
            cat_majority += 1
        if top_spec_n > len(specs) / 2:
            spec_majority += 1

    n = len(all_pids)
    print(f" Category unanimous: {cat_unanimous}/{n} ({cat_unanimous/n*100:.1f}%)")
    print(f" Category majority: {cat_majority}/{n} ({cat_majority/n*100:.1f}%)")
    print(f" Specificity unanimous: {spec_unanimous}/{n} ({spec_unanimous/n*100:.1f}%)")
    print(f" Specificity majority: {spec_majority}/{n} ({spec_majority/n*100:.1f}%)")
    print(f" Both unanimous: {both_unanimous}/{n} ({both_unanimous/n*100:.1f}%)")
else:
    print(f" Only {len(all_pids)} common paragraphs — skipping full consensus")
    print(" (Some models may still be running)")
|
|
|
|
# ═══════════════════════════════════════════════════════════
# 8. COST SUMMARY
# ═══════════════════════════════════════════════════════════
# Per-model totals/averages pulled from each annotation's "provenance"
# metadata: total USD cost, mean latency, mean total tokens per paragraph,
# mean reasoning tokens per paragraph.
print("\n" + "═" * 70)
print("COST & LATENCY SUMMARY")
print("═" * 70)

header = f"{'Model':<16}{'Cost':>10}{'Avg ms':>10}{'Tokens/p':>10}{'Reason/p':>10}"
print(header)
print("─" * len(header))

total_cost = 0.0
for m in model_names:
    display = MODEL_DISPLAY.get(m, m)[:15]
    costs = []
    latencies = []
    tokens = []
    reasoning = []
    # Single pass over the annotations; the previous version made a second
    # full pass just to compute the token average.
    for ann in models[m].values():
        prov = ann.get("provenance", {})
        costs.append(prov.get("costUsd", 0))
        latencies.append(prov.get("latencyMs", 0))
        tokens.append(prov.get("inputTokens", 0) + prov.get("outputTokens", 0))
        reasoning.append(prov.get("reasoningTokens", 0))

    cost = sum(costs)
    total_cost += cost
    # Guard empty lists so np.mean([]) never produces a NaN in the table.
    avg_lat = np.mean(latencies) if latencies else 0
    avg_tok = np.mean(tokens) if tokens else 0
    avg_reason = np.mean(reasoning) if reasoning else 0

    print(f"{display:<16}${cost:>9.4f}{avg_lat:>9.0f}ms{avg_tok:>10.0f}{avg_reason:>10.0f}")

print(f"\n Total benchmark cost: ${total_cost:.4f}")

print("\n" + "═" * 70)
print("DONE")
print("═" * 70)
|