""" V2 holdout benchmark analysis. Compares all models in data/annotations/v2-bench/ on the 1,200 v2 holdout. Uses GPT-5.4 (v4.5) as reference since it's our best-validated model. Outputs: - Per-model distribution tables (category + specificity) - Pairwise agreement matrix (category, specificity, both) - Per-model agreement with GPT-5.4 reference - Confusion patterns: where models disagree and why - Confidence distribution per model - Specific facts coverage analysis """ import json import sys from collections import Counter, defaultdict from itertools import combinations from pathlib import Path import numpy as np ROOT = Path(__file__).resolve().parent.parent V2_BENCH = ROOT / "data/annotations/v2-bench" GOLDEN_DIR = ROOT / "data/annotations/golden" CATEGORIES = [ "Board Governance", "Management Role", "Risk Management Process", "Third-Party Risk", "Incident Disclosure", "Strategy Integration", "None/Other", ] CAT_SHORT = {"Board Governance": "BG", "Management Role": "MR", "Risk Management Process": "RMP", "Third-Party Risk": "TP", "Incident Disclosure": "ID", "Strategy Integration": "SI", "None/Other": "N/O"} SPEC_LABELS = {1: "L1", 2: "L2", 3: "L3", 4: "L4"} MODEL_DISPLAY = { "gemini-3.1-flash-lite-preview": "Gemini Lite", "mimo-v2-flash": "MIMO Flash", "grok-4.1-fast": "Grok Fast", "gpt-5.4": "GPT-5.4", "kimi-k2.5": "Kimi K2.5", "gemini-3.1-pro-preview": "Gemini Pro", "glm-5": "GLM-5", "minimax-m2.7": "MiniMax M2.7", "mimo-v2-pro": "MIMO Pro", } REFERENCE_MODEL = "gpt-5.4" def load_jsonl(path: Path) -> list[dict]: records = [] with open(path) as f: for line in f: line = line.strip() if line: records.append(json.loads(line)) return records def cohens_kappa(a: list, b: list) -> float: assert len(a) == len(b) n = len(a) if n == 0: return 0.0 labels = sorted(set(a) | set(b)) idx = {l: i for i, l in enumerate(labels)} k = len(labels) conf = np.zeros((k, k)) for x, y in zip(a, b): conf[idx[x]][idx[y]] += 1 po = np.trace(conf) / n pe = sum((conf[i, :].sum() / n) * 
(conf[:, i].sum() / n) for i in range(k)) if pe >= 1.0: return 1.0 return (po - pe) / (1 - pe) def weighted_kappa(a: list[int], b: list[int]) -> float: """Quadratic-weighted kappa for ordinal specificity.""" assert len(a) == len(b) n = len(a) if n == 0: return 0.0 labels = sorted(set(a) | set(b)) idx = {l: i for i, l in enumerate(labels)} k = len(labels) conf = np.zeros((k, k)) for x, y in zip(a, b): conf[idx[x]][idx[y]] += 1 weights = np.zeros((k, k)) for i in range(k): for j in range(k): weights[i][j] = (i - j) ** 2 / (k - 1) ** 2 po = 1 - np.sum(weights * conf) / n expected = np.outer(conf.sum(axis=1), conf.sum(axis=0)) / n pe = 1 - np.sum(weights * expected) / n if pe == 0: return 1.0 return (po - pe) / (1 - pe) # ── Load all models ── print("Loading v2-bench annotations...") models: dict[str, dict[str, dict]] = {} # model_short -> {pid -> annotation} for f in sorted(V2_BENCH.glob("*.jsonl")): if "errors" in f.name or f.stem.startswith("gpt-5.4.v4"): continue records = load_jsonl(f) if len(records) < 100: print(f" SKIP {f.name}: only {len(records)} records") continue model_short = f.stem by_pid = {r["paragraphId"]: r for r in records} models[model_short] = by_pid display = MODEL_DISPLAY.get(model_short, model_short) print(f" {display}: {len(by_pid)} annotations") # Load Opus golden if available opus_path = GOLDEN_DIR / "opus.jsonl" if opus_path.exists(): records = load_jsonl(opus_path) if len(records) >= 100: by_pid = {r["paragraphId"]: r for r in records} models["opus-4.6"] = by_pid MODEL_DISPLAY["opus-4.6"] = "Opus 4.6" print(f" Opus 4.6: {len(by_pid)} annotations") # Common paragraph IDs across all models all_pids = set.intersection(*(set(m.keys()) for m in models.values())) if models else set() print(f"\n {len(all_pids)} paragraphs common to all {len(models)} models") if not all_pids: # Fall back to pairwise with reference ref = models.get(REFERENCE_MODEL) if ref: all_pids = set(ref.keys()) print(f" Using {len(all_pids)} reference model paragraphs for 
pairwise analysis") model_names = sorted(models.keys(), key=lambda m: list(MODEL_DISPLAY.keys()).index(m) if m in MODEL_DISPLAY else 999) def get_label(model: str, pid: str) -> dict | None: ann = models.get(model, {}).get(pid) if not ann: return None return ann.get("label", ann) # ═══════════════════════════════════════════════════════════ # 1. DISTRIBUTION TABLES # ═══════════════════════════════════════════════════════════ print("\n" + "═" * 70) print("CATEGORY DISTRIBUTION") print("═" * 70) header = f"{'Model':<16}" + "".join(f"{s:>8}" for s in CAT_SHORT.values()) print(header) print("─" * len(header)) for m in model_names: display = MODEL_DISPLAY.get(m, m)[:15] cats = [get_label(m, pid) for pid in models[m]] cats = [l["content_category"] for l in cats if l] counts = Counter(cats) total = len(cats) row = f"{display:<16}" for full_name in CATEGORIES: pct = counts.get(full_name, 0) / total * 100 if total else 0 row += f"{pct:>7.1f}%" print(row) print("\n" + "═" * 70) print("SPECIFICITY DISTRIBUTION") print("═" * 70) header = f"{'Model':<16}" + "".join(f"{s:>8}" for s in SPEC_LABELS.values()) + f"{'Med%':>8}" print(header) print("─" * len(header)) for m in model_names: display = MODEL_DISPLAY.get(m, m)[:15] labels = [get_label(m, pid) for pid in models[m]] specs = [l["specificity_level"] for l in labels if l] confs = [l.get("specificity_confidence", "high") for l in labels if l] counts = Counter(specs) total = len(specs) med_count = sum(1 for c in confs if c == "medium") row = f"{display:<16}" for level in SPEC_LABELS: pct = counts.get(level, 0) / total * 100 if total else 0 row += f"{pct:>7.1f}%" med_pct = med_count / total * 100 if total else 0 row += f"{med_pct:>7.1f}%" print(row) # ═══════════════════════════════════════════════════════════ # 2. 
# ═══════════════════════════════════════════════════════════
# 2. AGREEMENT WITH REFERENCE
# ═══════════════════════════════════════════════════════════

# Per-model agreement against the reference model: raw category/specificity
# agreement, Cohen's kappa on category, weighted kappa on specificity, and the
# share of paragraphs where both dimensions match.
ref_data = models.get(REFERENCE_MODEL)

if ref_data:
    ref_title = MODEL_DISPLAY.get(REFERENCE_MODEL, REFERENCE_MODEL).upper()
    print("\n" + "═" * 70)
    print(f"AGREEMENT WITH {ref_title}")
    print("═" * 70)

    header = f"{'Model':<16}{'Cat%':>8}{'Cat κ':>8}{'Spec%':>8}{'Spec κw':>8}{'Both%':>8}{'N':>6}"
    print(header)
    print("─" * len(header))

    for model in model_names:
        if model == REFERENCE_MODEL:
            continue

        shown = MODEL_DISPLAY.get(model, model)[:15]
        shared = set(models[model]) & set(ref_data)
        if len(shared) < 100:
            print(f"{shown:<16} (only {len(shared)} common paragraphs)")
            continue

        # Keep only paragraphs where both sides produced a usable label.
        pairs = []
        for pid in shared:
            ref_label = get_label(REFERENCE_MODEL, pid)
            model_label = get_label(model, pid)
            if ref_label and model_label:
                pairs.append((ref_label, model_label))

        ref_cats = [r["content_category"] for r, _ in pairs]
        m_cats = [o["content_category"] for _, o in pairs]
        ref_specs = [r["specificity_level"] for r, _ in pairs]
        m_specs = [o["specificity_level"] for _, o in pairs]

        n = len(pairs)
        cat_hits = sum(rc == mc for rc, mc in zip(ref_cats, m_cats))
        spec_hits = sum(rs == ms for rs, ms in zip(ref_specs, m_specs))
        both_hits = sum(
            rc == mc and rs == ms
            for rc, mc, rs, ms in zip(ref_cats, m_cats, ref_specs, m_specs)
        )

        cat_agree = cat_hits / n * 100
        spec_agree = spec_hits / n * 100
        both_pct = both_hits / n * 100
        cat_k = cohens_kappa(ref_cats, m_cats)
        spec_kw = weighted_kappa(ref_specs, m_specs)

        print(
            f"{shown:<16}{cat_agree:>7.1f}%{cat_k:>8.3f}"
            f"{spec_agree:>7.1f}%{spec_kw:>8.3f}{both_pct:>7.1f}%{n:>6}"
        )
# ═══════════════════════════════════════════════════════════
# 3. PAIRWISE AGREEMENT MATRIX (category kappa)
# ═══════════════════════════════════════════════════════════

print("\n" + "═" * 70)
print("PAIRWISE CATEGORY κ (lower triangle)")
print("═" * 70)

short_names = [MODEL_DISPLAY.get(name, name)[:10] for name in model_names]
header = f"{'':>12}" + "".join(f"{s:>12}" for s in short_names)
print(header)

for row_idx, row_model in enumerate(model_names):
    cells = [f"{short_names[row_idx]:>12}"]
    for col_idx, col_model in enumerate(model_names):
        # Only the strict lower triangle is computed; the rest is a dash.
        if col_idx >= row_idx:
            cells.append(f"{'—':>12}")
            continue

        shared = set(models[row_model]) & set(models[col_model])
        if len(shared) < 100:
            cells.append(f"{'n/a':>12}")
            continue

        # Collect both category lists in one pass over the shared paragraphs.
        cats_row, cats_col = [], []
        for pid in shared:
            row_label = get_label(row_model, pid)
            col_label = get_label(col_model, pid)
            if row_label and col_label:
                cats_row.append(row_label["content_category"])
                cats_col.append(col_label["content_category"])

        kappa = cohens_kappa(cats_row, cats_col)
        cells.append(f"{kappa:>12.3f}")
    print("".join(cells))

# ═══════════════════════════════════════════════════════════
# 4. SPECIFICITY CONFUSION WITH REFERENCE
# ═══════════════════════════════════════════════════════════

if ref_data:
    print("\n" + "═" * 70)
    print("SPECIFICITY CONFUSION vs REFERENCE (rows=model, cols=reference)")
    print("═" * 70)

    for model in model_names:
        if model == REFERENCE_MODEL:
            continue

        shown = MODEL_DISPLAY.get(model, model)
        shared = set(models[model]) & set(ref_data)
        if len(shared) < 100:
            continue

        # 4x4 count matrix; level N lands at index N-1.
        conf = np.zeros((4, 4), dtype=int)
        for pid in shared:
            ref_label = get_label(REFERENCE_MODEL, pid)
            model_label = get_label(model, pid)
            if not ref_label or not model_label:
                continue
            ref_s = ref_label["specificity_level"] - 1
            mod_s = model_label["specificity_level"] - 1
            conf[mod_s][ref_s] += 1

        print(f"\n {shown} (N={int(conf.sum())})")
        print(f" {'':>8}" + "".join(f"{'ref ' + SPEC_LABELS[lvl]:>8}" for lvl in range(1, 5)))
        for level_idx in range(4):
            counts_row = conf[level_idx]
            line = f" {SPEC_LABELS[level_idx + 1]:>8}"
            line += "".join(f"{cell:>8}" for cell in counts_row)
            print(line + f" | {counts_row.sum()}")
# ═══════════════════════════════════════════════════════════
# 5. CATEGORY DISAGREEMENT PATTERNS
# ═══════════════════════════════════════════════════════════

if ref_data:
    print("\n" + "═" * 70)
    print("TOP CATEGORY DISAGREEMENT PATTERNS vs REFERENCE")
    print("═" * 70)

    for model in model_names:
        if model == REFERENCE_MODEL:
            continue

        shown = MODEL_DISPLAY.get(model, model)
        shared = set(models[model].keys()) & set(ref_data.keys())
        if len(shared) < 100:
            continue

        # Tally (reference category, model category) for every mismatch.
        mismatches: Counter = Counter()
        for pid in shared:
            ref_label = get_label(REFERENCE_MODEL, pid)
            model_label = get_label(model, pid)
            if not ref_label or not model_label:
                continue
            ref_cat = CAT_SHORT[ref_label["content_category"]]
            model_cat = CAT_SHORT[model_label["content_category"]]
            if ref_cat != model_cat:
                mismatches[(ref_cat, model_cat)] += 1

        total_disagree = sum(mismatches.values())
        if total_disagree == 0:
            continue

        # The percentage is relative to all shared paragraph IDs.
        print(f"\n {shown}: {total_disagree} disagreements ({total_disagree/len(shared)*100:.1f}%)")
        for (ref_cat, model_cat), count in mismatches.most_common(5):
            print(f" {ref_cat} → {model_cat}: {count}")

# ═══════════════════════════════════════════════════════════
# 6. SPECIFIC_FACTS COVERAGE
# ═══════════════════════════════════════════════════════════

print("\n" + "═" * 70)
print("SPECIFIC_FACTS COVERAGE")
print("═" * 70)

header = f"{'Model':<16}{'Has facts':>10}{'Avg #':>8}{'L1 empty':>10}{'L2+ has':>10}"
print(header)
print("─" * len(header))

for model in model_names:
    shown = MODEL_DISPLAY.get(model, model)[:15]

    # (specificity level, fact list) for every paragraph with a usable label.
    rows = []
    for pid in models[model]:
        label = get_label(model, pid)
        if not label:
            continue
        rows.append((label["specificity_level"], label.get("specific_facts") or []))

    with_facts = [facts for _, facts in rows if facts]
    has_facts = len(with_facts)
    total_facts = sum(len(facts) for facts in with_facts)

    level_one = [facts for spec, facts in rows if spec == 1]
    higher = [facts for spec, facts in rows if spec != 1]
    l1_total = len(level_one)
    l1_empty = sum(1 for facts in level_one if not facts)
    l2plus_total = len(higher)
    l2plus_has = sum(1 for facts in higher if facts)

    # "Has facts" is relative to every annotation, including unlabeled ones.
    total = len(models[model])
    print(
        f"{shown:<16}"
        f"{has_facts/total*100:>9.1f}%"
        f"{total_facts/max(1,has_facts):>8.1f}"
        f"{l1_empty/max(1,l1_total)*100:>9.1f}%"
        f"{l2plus_has/max(1,l2plus_total)*100:>9.1f}%"
    )
# ═══════════════════════════════════════════════════════════
# 7. MULTI-MODEL CONSENSUS ANALYSIS
# ═══════════════════════════════════════════════════════════

print("\n" + "═" * 70)
print("MULTI-MODEL CONSENSUS")
print("═" * 70)

# For paragraphs common to all models
if len(all_pids) < 100:
    print(f" Only {len(all_pids)} common paragraphs — skipping full consensus")
    print(" (Some models may still be running)")
else:
    cat_unanimous = spec_unanimous = both_unanimous = 0
    cat_majority = spec_majority = 0

    for pid in all_pids:
        # Votes from every model that has a usable label for this paragraph.
        votes = [label for label in (get_label(name, pid) for name in model_names) if label]
        cats = [label["content_category"] for label in votes]
        specs = [label["specificity_level"] for label in votes]

        top_cat_n = Counter(cats).most_common(1)[0][1]
        top_spec_n = Counter(specs).most_common(1)[0][1]

        cats_agree = len(set(cats)) == 1
        specs_agree = len(set(specs)) == 1

        cat_unanimous += int(cats_agree)
        spec_unanimous += int(specs_agree)
        both_unanimous += int(cats_agree and specs_agree)
        # Majority means strictly more than half of the votes cast.
        cat_majority += int(top_cat_n > len(cats) / 2)
        spec_majority += int(top_spec_n > len(specs) / 2)

    n = len(all_pids)
    print(f" Category unanimous: {cat_unanimous}/{n} ({cat_unanimous/n*100:.1f}%)")
    print(f" Category majority: {cat_majority}/{n} ({cat_majority/n*100:.1f}%)")
    print(f" Specificity unanimous: {spec_unanimous}/{n} ({spec_unanimous/n*100:.1f}%)")
    print(f" Specificity majority: {spec_majority}/{n} ({spec_majority/n*100:.1f}%)")
    print(f" Both unanimous: {both_unanimous}/{n} ({both_unanimous/n*100:.1f}%)")
# ═══════════════════════════════════════════════════════════
# 8. COST SUMMARY
# ═══════════════════════════════════════════════════════════

print("\n" + "═" * 70)
print("COST & LATENCY SUMMARY")
print("═" * 70)

header = f"{'Model':<16}{'Cost':>10}{'Avg ms':>10}{'Tokens/p':>10}{'Reason/p':>10}"
print(header)
print("─" * len(header))

total_cost = 0
for model in model_names:
    shown = MODEL_DISPLAY.get(model, model)[:15]

    # One provenance dict per annotation; missing fields count as 0.
    provs = [ann.get("provenance", {}) for ann in models[model].values()]
    costs = [p.get("costUsd", 0) for p in provs]
    latencies = [p.get("latencyMs", 0) for p in provs]
    reasoning = [p.get("reasoningTokens", 0) for p in provs]
    tokens = [p.get("inputTokens", 0) + p.get("outputTokens", 0) for p in provs]

    cost = sum(costs)
    total_cost += cost

    avg_lat = np.mean(latencies) if latencies else 0
    avg_tok = np.mean(tokens)
    avg_reason = np.mean(reasoning) if reasoning else 0

    print(f"{shown:<16}${cost:>9.4f}{avg_lat:>9.0f}ms{avg_tok:>10.0f}{avg_reason:>10.0f}")

print(f"\n Total benchmark cost: ${total_cost:.4f}")

print("\n" + "═" * 70)
print("DONE")
print("═" * 70)