# SEC-cyBERT/scripts/analyze-gold.py
# Snapshot exported 2026-04-02 09:28:44 -04:00 (2199 lines, 96 KiB, Python).
# NOTE(review): the lines above/below this header were web-viewer chrome
# ("Raw Blame History", ambiguous-Unicode warning) captured in the paste;
# they are kept here as comments so the module parses as valid Python.

"""
Comprehensive 13-signal analysis of gold set holdout.
Sources (per paragraph):
3 human annotators (BIBD)
3 Stage 1 panel (gemini-flash-lite, mimo-v2-flash, grok-fast) — v2.5
1 Opus 4.6 golden — v3.0+codebook
6 benchmark models (gpt-5.4, kimi-k2.5, gemini-pro, glm-5, minimax-m2.7, mimo-pro) — v3.0
Outputs ~30 charts to data/gold/charts/ and detailed textual analysis to stdout.
"""
import json
import os
from collections import Counter, defaultdict
from itertools import combinations
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
# ── Paths ──
# Absolute project root; every input/output path below hangs off it.
ROOT = Path("/home/joey/Documents/sec-cyBERT")
GOLD_DIR = ROOT / "data/gold"
CHART_DIR = GOLD_DIR / "charts"  # all PNG charts are written here
STAGE1_PATH = ROOT / "data/annotations/stage1.patched.jsonl"
OPUS_PATH = ROOT / "data/annotations/golden/opus.jsonl"
BENCH_DIR = ROOT / "data/annotations/bench-holdout"
LABELS_PATH = GOLD_DIR / "human-labels-raw.jsonl"
METRICS_PATH = GOLD_DIR / "metrics.json"
# Canonical content categories; list order fixes chart and matrix ordering.
CATEGORIES = [
    "Board Governance", "Management Role", "Risk Management Process",
    "Third-Party Risk", "Incident Disclosure", "Strategy Integration", "None/Other",
]
CAT_SHORT = ["BG", "MR", "RMP", "TPR", "ID", "SI", "N/O"]  # axis-tick abbreviations
CAT_MAP = dict(zip(CATEGORIES, CAT_SHORT))  # full name -> abbreviation
CAT_IDX = {c: i for i, c in enumerate(CATEGORIES)}  # full name -> matrix index
SPEC_LEVELS = [1, 2, 3, 4]  # specificity ladder (generic .. quantified)
CHART_DIR.mkdir(parents=True, exist_ok=True)
# ── Shared style ──
# Applied once; every chart function below inherits these rcParams.
plt.rcParams.update({
    "figure.facecolor": "white",
    "axes.facecolor": "#fafafa",
    "axes.grid": True,
    "grid.alpha": 0.3,
    "font.size": 10,
})
# Short display names for models (full OpenRouter-style id -> chart label)
MODEL_SHORT = {
    "google/gemini-3.1-flash-lite-preview": "Gemini Lite",
    "x-ai/grok-4.1-fast": "Grok Fast",
    "xiaomi/mimo-v2-flash": "MIMO Flash",
    "anthropic/claude-opus-4-6": "Opus 4.6",
    "openai/gpt-5.4": "GPT-5.4",
    "moonshotai/kimi-k2.5": "Kimi K2.5",
    "google/gemini-3.1-pro-preview": "Gemini Pro",
    "z-ai/glm-5": "GLM-5",
    "minimax/minimax-m2.7": "MiniMax M2.7",
    "xiaomi/mimo-v2-pro": "MIMO Pro",
}
# Pricing/capability tier per model id; drives scatter/bar colors below.
MODEL_TIER = {
    "google/gemini-3.1-flash-lite-preview": "stage1",
    "x-ai/grok-4.1-fast": "stage1",
    "xiaomi/mimo-v2-flash": "stage1",
    "anthropic/claude-opus-4-6": "frontier",
    "openai/gpt-5.4": "frontier",
    "moonshotai/kimi-k2.5": "frontier",
    "google/gemini-3.1-pro-preview": "frontier",
    "z-ai/glm-5": "mid",
    "minimax/minimax-m2.7": "budget",
    "xiaomi/mimo-v2-pro": "mid",
}
# One color per tier, shared across every chart that colors by tier.
TIER_COLORS = {
    "stage1": "#95a5a6",
    "frontier": "#e74c3c",
    "mid": "#f39c12",
    "budget": "#27ae60",
}
def load_jsonl(path: Path) -> list[dict]:
    """Parse a JSON-Lines file into a list of records, skipping blank lines."""
    with open(path) as handle:
        return [json.loads(text) for text in handle if text.strip()]
def majority_vote(items: list) -> object | None:
if not items:
return None
c = Counter(items)
top, count = c.most_common(1)[0]
return top if count > len(items) / 2 else None
def plurality_vote(items: list) -> tuple:
    """Return ``(most_common_item, count)``; raises IndexError on empty input."""
    [(winner, votes)] = Counter(items).most_common(1)
    return (winner, votes)
def cohens_kappa(labels_a: list, labels_b: list) -> float:
    """Compute Cohen's kappa between two equal-length label sequences.

    Returns 0.0 for empty input and 1.0 when expected agreement saturates
    (both raters locked on the same single label).
    """
    assert len(labels_a) == len(labels_b)
    total = len(labels_a)
    if total == 0:
        return 0.0
    categories = sorted(set(labels_a) | set(labels_b))
    position = {label: i for i, label in enumerate(categories)}
    dim = len(categories)
    table = np.zeros((dim, dim))
    for a, b in zip(labels_a, labels_b):
        table[position[a], position[b]] += 1
    # Observed agreement = diagonal mass; expected = dot of the marginals.
    observed = np.trace(table) / total
    row_marginals = table.sum(axis=1) / total
    col_marginals = table.sum(axis=0) / total
    expected = float(np.dot(row_marginals, col_marginals))
    if expected >= 1.0:
        return 1.0
    return (observed - expected) / (1 - expected)
# ═══════════════════════════════════════════════════════════
# LOAD ALL DATA
# ═══════════════════════════════════════════════════════════
print("Loading data...")
# Human labels define the holdout universe: only paragraph ids that appear
# in the human label file are considered anywhere below.
human_labels = load_jsonl(LABELS_PATH)
holdout_ids = {l["paragraphId"] for l in human_labels}
print(f" {len(human_labels)} human labels, {len(holdout_ids)} paragraphs")
# Stage 1 annotations for holdout
stage1_by_pid: dict[str, list[dict]] = defaultdict(list)
with open(STAGE1_PATH) as f:
    for line in f:
        d = json.loads(line)
        if d["paragraphId"] in holdout_ids:
            stage1_by_pid[d["paragraphId"]].append(d)
print(f" {sum(len(v) for v in stage1_by_pid.values())} Stage 1 annotations")
# Opus golden annotations, keyed by paragraph id.
opus_by_pid: dict[str, dict] = {}
for r in load_jsonl(OPUS_PATH):
    if r["paragraphId"] in holdout_ids:
        opus_by_pid[r["paragraphId"]] = r
print(f" {len(opus_by_pid)} Opus annotations matched to holdout")
# Benchmark models
bench_by_model: dict[str, dict[str, dict]] = {}  # model_short -> {pid -> annotation}
bench_files = sorted(BENCH_DIR.glob("*.jsonl"))
for bf in bench_files:
    if "errors" in bf.name:
        continue  # sidecar error logs, not annotation files
    records = load_jsonl(bf)
    if len(records) < 100:
        continue  # skip partial runs (deepseek-r1 has 1 annotation)
    # Model identity comes from the first record's provenance.
    model_id = records[0]["provenance"]["modelId"]
    short = MODEL_SHORT.get(model_id, model_id.split("/")[-1])
    by_pid = {}
    for r in records:
        if r["paragraphId"] in holdout_ids:
            by_pid[r["paragraphId"]] = r
    bench_by_model[short] = by_pid
    print(f" {short}: {len(by_pid)} annotations")
# Human labels grouped by paragraph id (up to 3 labels per paragraph).
human_by_pid: dict[str, list[dict]] = defaultdict(list)
for l in human_labels:
    human_by_pid[l["paragraphId"]].append(l)
annotator_names = sorted({l["annotatorName"] for l in human_labels})
# Precomputed inter-annotator metrics (pairwise kappa etc.), used by chart 01.
metrics = json.loads(METRICS_PATH.read_text())
# Paragraph metadata (word counts etc.) restricted to the holdout.
para_all = load_jsonl(GOLD_DIR / "paragraphs-holdout.jsonl")
para_meta = {p["id"]: p for p in para_all if p["id"] in holdout_ids}
# ═══════════════════════════════════════════════════════════
# BUILD 13-SIGNAL MATRIX
# ═══════════════════════════════════════════════════════════
print("\nBuilding 13-signal matrix...")
# For each paragraph, collect all signals
# GenAI models: 3 Stage1 + Opus + 6 bench = 10
GENAI_SOURCES = ["Gemini Lite", "Grok Fast", "MIMO Flash", "Opus 4.6"] + sorted(bench_by_model.keys())
# Deduplicate (Opus might already be in bench); dict.fromkeys preserves order.
GENAI_SOURCES = list(dict.fromkeys(GENAI_SOURCES))
ALL_GENAI = GENAI_SOURCES
# NOTE(review): despite its name this maps short display name -> full model
# id (the inverse of MODEL_SHORT). It appears unused below — the chart
# functions rebuild the same inversion inline. Verify before relying on it.
MODEL_ID_TO_SHORT = {v: k for k, v in MODEL_SHORT.items()}
signals = {}  # pid -> {source_name: {cat, spec}}
for pid in holdout_ids:
    sig = {}
    # Human labels: keyed "H:<annotator>" so they are distinguishable from
    # model sources by prefix everywhere below.
    for lbl in human_by_pid.get(pid, []):
        sig[f"H:{lbl['annotatorName']}"] = {
            "cat": lbl["contentCategory"],
            "spec": lbl["specificityLevel"],
        }
    # Stage 1 panel (snake_case label schema, unlike the human camelCase).
    for a in stage1_by_pid.get(pid, []):
        mid = a["provenance"]["modelId"]
        short = MODEL_SHORT.get(mid, mid.split("/")[-1])
        sig[short] = {"cat": a["label"]["content_category"], "spec": a["label"]["specificity_level"]}
    # Opus golden annotation.
    if pid in opus_by_pid:
        sig["Opus 4.6"] = {
            "cat": opus_by_pid[pid]["label"]["content_category"],
            "spec": opus_by_pid[pid]["label"]["specificity_level"],
        }
    # Benchmark models (may overwrite an identical Opus entry if Opus was
    # also benchmarked — same source name, so last write wins).
    for model_short, by_pid_map in bench_by_model.items():
        if pid in by_pid_map:
            a = by_pid_map[pid]
            sig[model_short] = {"cat": a["label"]["content_category"], "spec": a["label"]["specificity_level"]}
    signals[pid] = sig
# Derive consensus labels per paragraph: raw label lists plus strict-majority
# votes for the human, Stage-1, and all-GenAI subsets.
consensus = {}
for pid in holdout_ids:
    sig = signals[pid]
    human_cats = [s["cat"] for k, s in sig.items() if k.startswith("H:")]
    human_specs = [s["spec"] for k, s in sig.items() if k.startswith("H:")]
    genai_cats = [s["cat"] for k, s in sig.items() if not k.startswith("H:")]
    genai_specs = [s["spec"] for k, s in sig.items() if not k.startswith("H:")]
    all_cats = human_cats + genai_cats
    all_specs = human_specs + genai_specs
    s1_cats = [s["cat"] for k, s in sig.items() if k in ("Gemini Lite", "Grok Fast", "MIMO Flash")]
    s1_specs = [s["spec"] for k, s in sig.items() if k in ("Gemini Lite", "Grok Fast", "MIMO Flash")]
    consensus[pid] = {
        "human_cats": human_cats,
        "human_specs": human_specs,
        "human_cat_maj": majority_vote(human_cats),
        # Spec votes are stringified so mixed int/str inputs hash together;
        # coerced back to int a few lines below.
        "human_spec_maj": majority_vote([str(s) for s in human_specs]),
        "human_cat_unanimous": len(set(human_cats)) == 1,
        "human_spec_unanimous": len(set(human_specs)) == 1,
        "s1_cats": s1_cats,
        "s1_specs": s1_specs,
        "s1_cat_maj": majority_vote(s1_cats),
        "s1_spec_maj": majority_vote([str(s) for s in s1_specs]),
        "genai_cats": genai_cats,
        "genai_specs": genai_specs,
        "genai_cat_maj": majority_vote(genai_cats),
        "genai_spec_maj": majority_vote([str(s) for s in genai_specs]),
        "all_cats": all_cats,
        "all_specs": all_specs,
        "all_cat_counts": Counter(all_cats),
        "all_spec_counts": Counter(all_specs),
        "n_signals": len(all_cats),
        "opus_cat": sig.get("Opus 4.6", {}).get("cat"),
        "opus_spec": sig.get("Opus 4.6", {}).get("spec"),
        "word_count": para_meta.get(pid, {}).get("wordCount", 0),
        "signals": sig,
    }
    # Fix human_spec_maj back to int. Truthiness is safe here: levels are
    # 1-4, so the only falsy value is None (no strict majority).
    hsm = consensus[pid]["human_spec_maj"]
    consensus[pid]["human_spec_maj"] = int(hsm) if hsm else None
    ssm = consensus[pid]["s1_spec_maj"]
    consensus[pid]["s1_spec_maj"] = int(ssm) if ssm else None
    gsm = consensus[pid]["genai_spec_maj"]
    consensus[pid]["genai_spec_maj"] = int(gsm) if gsm else None
# ═══════════════════════════════════════════════════════════
# ADJUDICATION TIERS
# ═══════════════════════════════════════════════════════════
# Route each paragraph into a tier based on how strongly the 13 signals
# converge:
#   Tier 1 — 10+/13 agree on BOTH dimensions (auto-accept)
#   Tier 2 — human majority and GenAI majority agree on category
#   Tier 3 — humans have no strict majority but GenAI does
#   Tier 4 — everything else (needs manual adjudication)
tiers: dict[int, list] = {1: [], 2: [], 3: [], 4: []}
for pid, c in consensus.items():
    # Only the counts of the leading category/spec matter for Tier 1.
    top_cat_n = c["all_cat_counts"].most_common(1)[0][1]
    top_spec_n = Counter(c["all_specs"]).most_common(1)[0][1]
    hm_cat = c["human_cat_maj"]
    gm_cat = c["genai_cat_maj"]
    if top_cat_n >= 10 and top_spec_n >= 10:
        tiers[1].append(pid)
    elif hm_cat and gm_cat and hm_cat == gm_cat:
        tiers[2].append(pid)
    elif hm_cat is None and gm_cat:
        tiers[3].append(pid)
    else:
        tiers[4].append(pid)
print("\nAdjudication tiers:")
# Percentage over the actual paragraph count. (The original divided by a
# hard-coded 12, i.e. silently assumed exactly 1,200 paragraphs.)
total_paragraphs = len(consensus) or 1
for t in range(1, 5):
    share = len(tiers[t]) / total_paragraphs * 100
    print(f" Tier {t}: {len(tiers[t])} paragraphs ({share:.1f}%)")
# ═══════════════════════════════════════════════════════════
# CHART 01: Pairwise Kappa Heatmaps (human annotators)
# ═══════════════════════════════════════════════════════════
def plot_kappa_heatmaps():
    """Chart 01: side-by-side pairwise Cohen's-kappa heatmaps for the human
    annotators (category vs specificity), read from precomputed metrics.json.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5.5))
    for ax, dim_key, title in [
        (ax1, "category", "Category"),
        (ax2, "specificity", "Specificity"),
    ]:
        data = metrics["pairwiseKappa"][dim_key]
        names = data["annotators"]
        matrix = np.array(data["matrix"])
        # Blank the diagonal — self-agreement is trivially 1.0.
        mask = np.eye(len(names), dtype=bool)
        display = np.where(mask, np.nan, matrix)
        im = ax.imshow(display, cmap="RdYlGn", vmin=0, vmax=1, aspect="equal")
        ax.set_xticks(range(len(names)))
        ax.set_xticklabels(names, rotation=45, ha="right", fontsize=9)
        ax.set_yticks(range(len(names)))
        ax.set_yticklabels(names, fontsize=9)
        ax.set_title(f"Pairwise Cohen's κ — {title}", fontsize=12, fontweight="bold")
        # Annotate off-diagonal cells; white text on dark (low-κ) cells.
        for i in range(len(names)):
            for j in range(len(names)):
                if i != j:
                    color = "white" if matrix[i][j] < 0.4 else "black"
                    ax.text(j, i, f"{matrix[i][j]:.2f}", ha="center", va="center",
                            fontsize=8, color=color)
    # Single shared colorbar (uses the last image; both share vmin/vmax).
    fig.colorbar(im, ax=[ax1, ax2], shrink=0.8, label="Cohen's κ")
    fig.tight_layout()
    fig.savefig(CHART_DIR / "01_kappa_heatmaps.png", dpi=150)
    plt.close(fig)
    print(" 01_kappa_heatmaps.png")
# ═══════════════════════════════════════════════════════════
# CHART 02: Per-source category distribution (all 13 sources)
# ═══════════════════════════════════════════════════════════
def plot_all_source_category_dist():
    """Chart 02: grouped bars of category share per source — each human
    annotator, the human/Stage-1 majority pseudo-sources, and all GenAI
    models."""
    fig, ax = plt.subplots(figsize=(18, 7))
    sources = annotator_names + ["Human Maj", "S1 Maj"] + sorted(ALL_GENAI)
    dist = {s: Counter() for s in sources}
    # Individual human labels.
    for l in human_labels:
        dist[l["annotatorName"]][l["contentCategory"]] += 1
    # Majority pseudo-sources (paragraphs with no strict majority are skipped).
    for c in consensus.values():
        if c["human_cat_maj"]:
            dist["Human Maj"][c["human_cat_maj"]] += 1
        if c["s1_cat_maj"]:
            dist["S1 Maj"][c["s1_cat_maj"]] += 1
    # Every non-human signal.
    for pid, c in consensus.items():
        for src, sig in c["signals"].items():
            if not src.startswith("H:"):
                dist[src][sig["cat"]] += 1
    x = np.arange(len(sources))
    width = 0.11  # 7 categories * 0.11 ≈ 0.77, leaves gaps between groups
    offsets = np.arange(len(CATEGORIES)) - len(CATEGORIES) / 2 + 0.5
    colors = plt.cm.Set2(np.linspace(0, 1, len(CATEGORIES)))
    for i, (cat, color) in enumerate(zip(CATEGORIES, colors)):
        counts = [dist[s].get(cat, 0) for s in sources]
        totals = [sum(dist[s].values()) or 1 for s in sources]  # guard /0
        pcts = [c / t * 100 for c, t in zip(counts, totals)]
        ax.bar(x + offsets[i] * width, pcts, width, label=CAT_MAP[cat], color=color)
    ax.set_xticks(x)
    ax.set_xticklabels(sources, rotation=60, ha="right", fontsize=8)
    ax.set_ylabel("% of labels")
    ax.set_title("Category Distribution — All Sources (Humans + 10 GenAI Models)", fontweight="bold")
    ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left", fontsize=8)
    ax.yaxis.set_major_formatter(mticker.PercentFormatter())
    fig.tight_layout()
    fig.savefig(CHART_DIR / "02_category_distribution_all.png", dpi=150)
    plt.close(fig)
    print(" 02_category_distribution_all.png")
# ═══════════════════════════════════════════════════════════
# CHART 03: Per-source specificity distribution
# ═══════════════════════════════════════════════════════════
def plot_all_source_spec_dist():
    """Chart 03: grouped bars of specificity-level share for every source
    (same source layout as chart 02)."""
    fig, ax = plt.subplots(figsize=(18, 6))
    sources = annotator_names + ["Human Maj", "S1 Maj"] + sorted(ALL_GENAI)
    dist = {s: Counter() for s in sources}
    for l in human_labels:
        dist[l["annotatorName"]][l["specificityLevel"]] += 1
    # Majority pseudo-sources; None means no strict majority for that paragraph.
    for c in consensus.values():
        hm = c["human_spec_maj"]
        if hm is not None:
            dist["Human Maj"][hm] += 1
        sm = c["s1_spec_maj"]
        if sm is not None:
            dist["S1 Maj"][sm] += 1
    for pid, c in consensus.items():
        for src, sig in c["signals"].items():
            if not src.startswith("H:"):
                dist[src][sig["spec"]] += 1
    x = np.arange(len(sources))
    width = 0.18  # 4 levels * 0.18 = 0.72 per group
    colors = ["#e74c3c", "#f39c12", "#2ecc71", "#3498db"]
    spec_labels = ["1 Generic", "2 Sector", "3 Firm-Specific", "4 Quantified"]
    for i, (level, color, label) in enumerate(zip(SPEC_LEVELS, colors, spec_labels)):
        counts = [dist[s].get(level, 0) for s in sources]
        totals = [sum(dist[s].values()) or 1 for s in sources]  # guard /0
        pcts = [c / t * 100 for c, t in zip(counts, totals)]
        ax.bar(x + (i - 1.5) * width, pcts, width, label=label, color=color)
    ax.set_xticks(x)
    ax.set_xticklabels(sources, rotation=60, ha="right", fontsize=8)
    ax.set_ylabel("% of labels")
    ax.set_title("Specificity Distribution — All Sources", fontweight="bold")
    ax.legend()
    ax.yaxis.set_major_formatter(mticker.PercentFormatter())
    fig.tight_layout()
    fig.savefig(CHART_DIR / "03_specificity_distribution_all.png", dpi=150)
    plt.close(fig)
    print(" 03_specificity_distribution_all.png")
# ═══════════════════════════════════════════════════════════
# CHART 04: Human confusion matrices (category + specificity)
# ═══════════════════════════════════════════════════════════
def plot_human_confusion():
    """Chart 04: symmetric co-occurrence ("confusion") matrices between human
    labels on the same paragraph, for category and specificity.

    Every unordered pair of same-paragraph labels increments both (a, b) and
    (b, a); rows are then normalized to percentages.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
    cat_conf = np.zeros((len(CATEGORIES), len(CATEGORIES)))
    for pid, lbls in human_by_pid.items():
        cats = [l["contentCategory"] for l in lbls]
        for i in range(len(cats)):
            for j in range(i + 1, len(cats)):
                a, b = CAT_IDX[cats[i]], CAT_IDX[cats[j]]
                cat_conf[a][b] += 1
                cat_conf[b][a] += 1
    row_sums = cat_conf.sum(axis=1, keepdims=True)
    cat_conf_norm = np.where(row_sums > 0, cat_conf / row_sums * 100, 0)
    im1 = ax1.imshow(cat_conf_norm, cmap="YlOrRd", aspect="equal")
    ax1.set_xticks(range(len(CAT_SHORT)))
    ax1.set_xticklabels(CAT_SHORT, fontsize=9)
    ax1.set_yticks(range(len(CAT_SHORT)))
    ax1.set_yticklabels(CAT_SHORT, fontsize=9)
    ax1.set_title("Human Category Confusion (row-norm %)", fontweight="bold")
    for i in range(len(CAT_SHORT)):
        for j in range(len(CAT_SHORT)):
            val = cat_conf_norm[i][j]
            if val > 0.5:  # suppress near-zero cell labels
                color = "white" if val > 40 else "black"
                ax1.text(j, i, f"{val:.0f}", ha="center", va="center", fontsize=7, color=color)
    # Same construction for specificity (levels 1-4 -> indices 0-3).
    spec_conf = np.zeros((4, 4))
    for pid, lbls in human_by_pid.items():
        specs = [l["specificityLevel"] for l in lbls]
        for i in range(len(specs)):
            for j in range(i + 1, len(specs)):
                a, b = specs[i] - 1, specs[j] - 1
                spec_conf[a][b] += 1
                spec_conf[b][a] += 1
    row_sums = spec_conf.sum(axis=1, keepdims=True)
    spec_conf_norm = np.where(row_sums > 0, spec_conf / row_sums * 100, 0)
    im2 = ax2.imshow(spec_conf_norm, cmap="YlOrRd", aspect="equal")
    ax2.set_xticks(range(4))
    ax2.set_xticklabels(["S1", "S2", "S3", "S4"], fontsize=9)
    ax2.set_yticks(range(4))
    ax2.set_yticklabels(["S1", "S2", "S3", "S4"], fontsize=9)
    ax2.set_title("Human Specificity Confusion (row-norm %)", fontweight="bold")
    for i in range(4):
        for j in range(4):
            val = spec_conf_norm[i][j]
            if val > 0.5:
                color = "white" if val > 40 else "black"
                ax2.text(j, i, f"{val:.0f}", ha="center", va="center", fontsize=9, color=color)
    fig.colorbar(im1, ax=ax1, shrink=0.8)
    fig.colorbar(im2, ax=ax2, shrink=0.8)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "04_human_confusion.png", dpi=150)
    plt.close(fig)
    print(" 04_human_confusion.png")
# ═══════════════════════════════════════════════════════════
# CHART 05: GenAI Model Agreement Matrix (10×10 pairwise kappa)
# ═══════════════════════════════════════════════════════════
def plot_genai_agreement_matrix():
    """Chart 05: pairwise Cohen's-kappa heatmaps between all GenAI models,
    computed over paragraphs both models labeled."""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))
    models = sorted(ALL_GENAI)
    n = len(models)
    for ax, dim, title in [(ax1, "cat", "Category"), (ax2, "spec", "Specificity")]:
        matrix = np.eye(n)
        for i, m1 in enumerate(models):
            for j, m2 in enumerate(models):
                if i >= j:
                    continue  # compute upper triangle only, mirror below
                labels_a, labels_b = [], []
                for pid, c in consensus.items():
                    sig = c["signals"]
                    if m1 in sig and m2 in sig:
                        # str() so int spec levels compare like categories.
                        labels_a.append(str(sig[m1][dim]))
                        labels_b.append(str(sig[m2][dim]))
                # NOTE(review): pairs with <100 shared paragraphs keep the
                # initial 0.0 and render as "0.00" — confirm that's intended
                # rather than showing a blank/NaN cell.
                if len(labels_a) >= 100:
                    k = cohens_kappa(labels_a, labels_b)
                    matrix[i][j] = k
                    matrix[j][i] = k
        # Blank the diagonal (trivially 1.0).
        mask = np.eye(n, dtype=bool)
        display = np.where(mask, np.nan, matrix)
        im = ax.imshow(display, cmap="RdYlGn", vmin=0.2, vmax=1, aspect="equal")
        ax.set_xticks(range(n))
        ax.set_xticklabels(models, rotation=60, ha="right", fontsize=7)
        ax.set_yticks(range(n))
        ax.set_yticklabels(models, fontsize=7)
        ax.set_title(f"GenAI Pairwise κ — {title}", fontweight="bold")
        for i in range(n):
            for j in range(n):
                if i != j:
                    val = matrix[i][j]
                    color = "white" if val < 0.5 else "black"
                    ax.text(j, i, f"{val:.2f}", ha="center", va="center", fontsize=6, color=color)
    fig.colorbar(im, ax=[ax1, ax2], shrink=0.7, label="Cohen's κ")
    fig.tight_layout()
    fig.savefig(CHART_DIR / "05_genai_agreement_matrix.png", dpi=150)
    plt.close(fig)
    print(" 05_genai_agreement_matrix.png")
# ═══════════════════════════════════════════════════════════
# CHART 06: Cross-source confusion (Human vs Stage1, Human vs Opus, Human vs GenAI consensus)
# ═══════════════════════════════════════════════════════════
def plot_cross_source_confusion():
    """Chart 06: category confusion matrices of the human majority against
    the Stage-1 majority, Opus, and the full-GenAI majority. Paragraphs
    missing either side's label are skipped."""
    comparisons = [
        ("Human Maj", "S1 Maj", "human_cat_maj", "s1_cat_maj"),
        ("Human Maj", "Opus 4.6", "human_cat_maj", "opus_cat"),
        ("Human Maj", "GenAI Maj", "human_cat_maj", "genai_cat_maj"),
    ]
    fig, axes = plt.subplots(1, 3, figsize=(21, 5.5))
    for ax, (name_a, name_b, key_a, key_b) in zip(axes, comparisons):
        conf = np.zeros((len(CATEGORIES), len(CATEGORIES)))
        total, agree = 0, 0
        for pid, c in consensus.items():
            a_val = c[key_a]
            b_val = c[key_b]
            if a_val and b_val:  # both sides must have a (non-None) label
                conf[CAT_IDX[a_val]][CAT_IDX[b_val]] += 1
                total += 1
                if a_val == b_val:
                    agree += 1
        row_sums = conf.sum(axis=1, keepdims=True)
        conf_norm = np.where(row_sums > 0, conf / row_sums * 100, 0)
        im = ax.imshow(conf_norm, cmap="YlGnBu", aspect="equal")
        ax.set_xticks(range(len(CAT_SHORT)))
        ax.set_xticklabels(CAT_SHORT, fontsize=8)
        ax.set_yticks(range(len(CAT_SHORT)))
        ax.set_yticklabels(CAT_SHORT, fontsize=8)
        pct = agree / total * 100 if total > 0 else 0
        ax.set_title(f"{name_a} vs {name_b}\n({pct:.1f}% agree, n={total})",
                     fontweight="bold", fontsize=10)
        ax.set_ylabel(name_a)
        ax.set_xlabel(name_b)
        for i in range(len(CAT_SHORT)):
            for j in range(len(CAT_SHORT)):
                val = conf_norm[i][j]
                if val > 0.5:  # suppress near-zero cell labels
                    color = "white" if val > 50 else "black"
                    ax.text(j, i, f"{val:.0f}", ha="center", va="center", fontsize=7, color=color)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "06_cross_source_category.png", dpi=150)
    plt.close(fig)
    print(" 06_cross_source_category.png")
# ═══════════════════════════════════════════════════════════
# CHART 07: Cross-source specificity confusion
# ═══════════════════════════════════════════════════════════
def plot_cross_source_specificity():
    """Chart 07: specificity confusion matrices of the human majority against
    the Stage-1 majority, Opus, and the full-GenAI majority."""
    comparisons = [
        ("Human Maj", "S1 Maj", "human_spec_maj", "s1_spec_maj"),
        ("Human Maj", "Opus", "human_spec_maj", "opus_spec"),
        ("Human Maj", "GenAI Maj", "human_spec_maj", "genai_spec_maj"),
    ]
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    for ax, (name_a, name_b, key_a, key_b) in zip(axes, comparisons):
        conf = np.zeros((4, 4))
        total, agree = 0, 0
        for pid, c in consensus.items():
            a_val = c[key_a]
            b_val = c[key_b]
            # Explicit None checks (not truthiness) since values are ints.
            if a_val is not None and b_val is not None:
                conf[int(a_val) - 1][int(b_val) - 1] += 1  # levels 1-4 -> 0-3
                total += 1
                if int(a_val) == int(b_val):
                    agree += 1
        row_sums = conf.sum(axis=1, keepdims=True)
        conf_norm = np.where(row_sums > 0, conf / row_sums * 100, 0)
        im = ax.imshow(conf_norm, cmap="YlGnBu", aspect="equal")
        ax.set_xticks(range(4))
        ax.set_xticklabels(["S1", "S2", "S3", "S4"])
        ax.set_yticks(range(4))
        ax.set_yticklabels(["S1", "S2", "S3", "S4"])
        pct = agree / total * 100 if total > 0 else 0
        ax.set_title(f"{name_a} vs {name_b}\n({pct:.1f}% agree, n={total})", fontweight="bold")
        ax.set_ylabel(name_a)
        ax.set_xlabel(name_b)
        for i in range(4):
            for j in range(4):
                val = conf_norm[i][j]
                if val > 0.5:
                    color = "white" if val > 50 else "black"
                    ax.text(j, i, f"{val:.0f}", ha="center", va="center", fontsize=9, color=color)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "07_cross_source_specificity.png", dpi=150)
    plt.close(fig)
    print(" 07_cross_source_specificity.png")
# ═══════════════════════════════════════════════════════════
# CHART 08: Adjudication tier breakdown
# ═══════════════════════════════════════════════════════════
def plot_adjudication_tiers():
    """Chart 08: adjudication tier sizes (left) and the per-tier category
    mix as stacked horizontal bars (right).

    Fixes over the original: a first, buggy stacked-bar attempt that was
    immediately wiped by ``ax2.clear()`` is removed, and the paragraph total
    is computed from the tiers instead of being hard-coded to 1,200.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
    # Left panel: tier counts with count + percentage labels.
    tier_sizes = [len(tiers[t]) for t in range(1, 5)]
    total = sum(tier_sizes) or 1  # guard /0 on an empty run
    tier_labels = [
        "Tier 1\n10+/13 agree\n(auto)",
        "Tier 2\nHuman+GenAI\nmaj agree",
        "Tier 3\nHumans split\nGenAI converges",
        "Tier 4\nUniversal\ndisagreement",
    ]
    tier_colors = ["#27ae60", "#3498db", "#f39c12", "#e74c3c"]
    bars = ax1.bar(range(4), tier_sizes, color=tier_colors)
    ax1.set_xticks(range(4))
    ax1.set_xticklabels(tier_labels, fontsize=8)
    ax1.set_ylabel("Paragraphs")
    ax1.set_title(f"Adjudication Tier Distribution ({total:,} paragraphs)", fontweight="bold")
    for bar, n in zip(bars, tier_sizes):
        pct = n / total * 100
        ax1.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 5,
                 f"{n}\n({pct:.1f}%)", ha="center", fontsize=10, fontweight="bold")
    # Right panel: stacked horizontal bars — % of each tier's paragraphs
    # whose plurality (top all-signal) category is each CATEGORY.
    tier_names = [f"Tier {t}" for t in range(1, 5)]
    bottom = np.zeros(4)
    colors = plt.cm.Set2(np.linspace(0, 1, len(CATEGORIES)))
    for ci, cat in enumerate(CATEGORIES):
        vals = []
        for t in range(1, 5):
            cat_count = sum(1 for pid in tiers[t]
                            if consensus[pid]["all_cat_counts"].most_common(1)[0][0] == cat)
            vals.append(cat_count / len(tiers[t]) * 100 if tiers[t] else 0)
        ax2.barh(tier_names, vals, left=bottom, color=colors[ci], label=CAT_MAP[cat])
        bottom += np.array(vals)
    ax2.set_xlabel("% of paragraphs in tier")
    ax2.set_title("Category Mix by Adjudication Tier", fontweight="bold")
    ax2.legend(bbox_to_anchor=(1.02, 1), loc="upper left", fontsize=8)
    ax2.set_xlim(0, 105)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "08_adjudication_tiers.png", dpi=150)
    plt.close(fig)
    print(" 08_adjudication_tiers.png")
# ═══════════════════════════════════════════════════════════
# CHART 09: Per-model accuracy vs Opus (as quasi-ground-truth)
# ═══════════════════════════════════════════════════════════
def plot_model_accuracy_vs_opus():
    """Chart 09: per-model % agreement with Opus 4.6 (category left,
    specificity right), sorted by category agreement and colored by tier."""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))
    models = sorted(ALL_GENAI)
    cat_acc = []
    spec_acc = []
    model_labels = []
    for model in models:
        agree_cat, agree_spec, total = 0, 0, 0
        for pid, c in consensus.items():
            sig = c["signals"]
            # Count only paragraphs both this model and Opus labeled; Opus
            # itself is excluded (it would trivially score 100%).
            if model in sig and "Opus 4.6" in sig and model != "Opus 4.6":
                total += 1
                if sig[model]["cat"] == sig["Opus 4.6"]["cat"]:
                    agree_cat += 1
                if sig[model]["spec"] == sig["Opus 4.6"]["spec"]:
                    agree_spec += 1
        if total > 0:
            cat_acc.append(agree_cat / total * 100)
            spec_acc.append(agree_spec / total * 100)
            model_labels.append(model)
    # Sort all three parallel lists by category accuracy, descending.
    order = np.argsort(cat_acc)[::-1]
    cat_acc = [cat_acc[i] for i in order]
    spec_acc = [spec_acc[i] for i in order]
    model_labels = [model_labels[i] for i in order]
    # Inverts MODEL_SHORT inline (short name -> model id) to look up each
    # bar's pricing tier color.
    tier_c = [TIER_COLORS.get(MODEL_TIER.get(
        {v: k for k, v in MODEL_SHORT.items()}.get(m, ""), ""), "#999") for m in model_labels]
    x = np.arange(len(model_labels))
    width = 0.35  # NOTE(review): unused — the barh calls below never take it
    bars1 = ax1.barh(x, cat_acc, color=tier_c, edgecolor="black", linewidth=0.5)
    ax1.set_yticks(x)
    ax1.set_yticklabels(model_labels, fontsize=8)
    ax1.set_xlabel("Agreement with Opus (%)")
    ax1.set_title("Category Agreement with Opus 4.6", fontweight="bold")
    ax1.set_xlim(60, 100)
    ax1.invert_yaxis()  # best model at the top
    for bar, v in zip(bars1, cat_acc):
        ax1.text(bar.get_width() + 0.3, bar.get_y() + bar.get_height() / 2,
                 f"{v:.1f}%", va="center", fontsize=8)
    bars2 = ax2.barh(x, spec_acc, color=tier_c, edgecolor="black", linewidth=0.5)
    ax2.set_yticks(x)
    ax2.set_yticklabels(model_labels, fontsize=8)
    ax2.set_xlabel("Agreement with Opus (%)")
    ax2.set_title("Specificity Agreement with Opus 4.6", fontweight="bold")
    ax2.set_xlim(30, 100)
    ax2.invert_yaxis()
    for bar, v in zip(bars2, spec_acc):
        ax2.text(bar.get_width() + 0.3, bar.get_y() + bar.get_height() / 2,
                 f"{v:.1f}%", va="center", fontsize=8)
    # Legend for tiers
    from matplotlib.patches import Patch
    legend_elements = [Patch(facecolor=c, label=t) for t, c in TIER_COLORS.items()]
    ax1.legend(handles=legend_elements, loc="lower right", fontsize=8)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "09_model_accuracy_vs_opus.png", dpi=150)
    plt.close(fig)
    print(" 09_model_accuracy_vs_opus.png")
# ═══════════════════════════════════════════════════════════
# CHART 10: Cost vs Accuracy scatter
# ═══════════════════════════════════════════════════════════
def plot_cost_vs_accuracy():
    """Chart 10: scatter of average per-call cost vs category agreement with
    Opus 4.6, one point per GenAI model, colored by pricing tier.

    Costs average every provenance record available for a model (benchmark
    holdout runs + Stage 1 runs + the Opus run). Fixes over the original:
    the x-axis label said "millicents" although the plotted value is
    USD x 1000 (milli-dollars, $0.001 units); unused latency aggregation
    was removed; the MODEL_SHORT inversion is hoisted out of the loop.
    """
    fig, ax = plt.subplots(figsize=(12, 7))
    # Per-model cost samples, keyed by short display name.
    model_costs: dict[str, list[float]] = defaultdict(list)
    # Benchmark runs — same filtering as the loader (skip error sidecars and
    # partial runs with <100 records).
    for bf in bench_files:
        if "errors" in bf.name:
            continue
        records = load_jsonl(bf)
        if len(records) < 100:
            continue
        mid = records[0]["provenance"]["modelId"]
        short = MODEL_SHORT.get(mid, mid.split("/")[-1])
        for r in records:
            model_costs[short].append(r["provenance"].get("costUsd", 0))
    # Stage 1 costs from annotations
    for annots in stage1_by_pid.values():
        for a in annots:
            mid = a["provenance"]["modelId"]
            short = MODEL_SHORT.get(mid, mid.split("/")[-1])
            model_costs[short].append(a["provenance"].get("costUsd", 0))
    # Opus golden run
    for r in opus_by_pid.values():
        model_costs["Opus 4.6"].append(r["provenance"].get("costUsd", 0))
    # Short display name -> full model id, for tier lookups (hoisted).
    short_to_id = {v: k for k, v in MODEL_SHORT.items()}
    for model in sorted(ALL_GENAI):
        costs = model_costs.get(model, [])
        if not costs:
            continue  # no provenance records for this model
        avg_cost = sum(costs) / len(costs)
        # Category agreement vs Opus over jointly-labeled paragraphs.
        agree, total = 0, 0
        for pid, c in consensus.items():
            sig = c["signals"]
            if model in sig and "Opus 4.6" in sig and model != "Opus 4.6":
                total += 1
                if sig[model]["cat"] == sig["Opus 4.6"]["cat"]:
                    agree += 1
        cat_acc = agree / total * 100 if total > 0 else 0
        tier = MODEL_TIER.get(short_to_id.get(model, ""), "mid")
        color = TIER_COLORS.get(tier, "#999")
        ax.scatter(avg_cost * 1000, cat_acc, s=150, c=color, edgecolors="black",
                   linewidths=0.5, zorder=3)
        ax.annotate(model, (avg_cost * 1000, cat_acc),
                    textcoords="offset points", xytext=(8, 4), fontsize=7)
    ax.set_xlabel("Average Cost per Call (milli-dollars, $0.001)")
    ax.set_ylabel("Category Agreement with Opus (%)")
    ax.set_title("Cost vs Category Accuracy (Opus as reference)", fontweight="bold")
    ax.set_ylim(60, 100)
    from matplotlib.patches import Patch
    legend_elements = [Patch(facecolor=c, label=t) for t, c in TIER_COLORS.items()]
    ax.legend(handles=legend_elements, loc="lower right")
    fig.tight_layout()
    fig.savefig(CHART_DIR / "10_cost_vs_accuracy.png", dpi=150)
    plt.close(fig)
    print(" 10_cost_vs_accuracy.png")
# ═══════════════════════════════════════════════════════════
# CHART 11: Per-category accuracy by model
# ═══════════════════════════════════════════════════════════
def plot_per_category_accuracy():
    """Chart 11: heatmap of per-category recall vs Opus — among paragraphs
    Opus assigned to a category, the % where the model chose it too."""
    fig, ax = plt.subplots(figsize=(16, 8))
    models = sorted(ALL_GENAI)
    # rows = models, cols = Opus's categories; values are recall %.
    data = np.zeros((len(models), len(CATEGORIES)))
    for mi, model in enumerate(models):
        for ci, cat in enumerate(CATEGORIES):
            agree, total = 0, 0
            for pid, c in consensus.items():
                sig = c["signals"]
                # Opus itself is excluded (would trivially be 100%).
                if "Opus 4.6" in sig and model in sig and model != "Opus 4.6":
                    if sig["Opus 4.6"]["cat"] == cat:
                        total += 1
                        if sig[model]["cat"] == cat:
                            agree += 1
            data[mi][ci] = agree / total * 100 if total > 0 else 0
    im = ax.imshow(data, cmap="RdYlGn", aspect="auto", vmin=50, vmax=100)
    ax.set_xticks(range(len(CAT_SHORT)))
    ax.set_xticklabels(CAT_SHORT, fontsize=10)
    ax.set_yticks(range(len(models)))
    ax.set_yticklabels(models, fontsize=8)
    ax.set_title("Per-Category Recall vs Opus (%) — Where each model excels/struggles", fontweight="bold")
    ax.set_xlabel("Opus label (true category)")
    for i in range(len(models)):
        for j in range(len(CATEGORIES)):
            val = data[i][j]
            color = "white" if val < 65 else "black"
            ax.text(j, i, f"{val:.0f}", ha="center", va="center", fontsize=8, color=color)
    fig.colorbar(im, ax=ax, shrink=0.6, label="Recall %")
    fig.tight_layout()
    fig.savefig(CHART_DIR / "11_per_category_accuracy.png", dpi=150)
    plt.close(fig)
    print(" 11_per_category_accuracy.png")
# ═══════════════════════════════════════════════════════════
# CHART 12: Ensemble size vs accuracy (how many models needed?)
# ═══════════════════════════════════════════════════════════
def plot_ensemble_accuracy():
    """Chart 12: box plots of majority-vote ensemble agreement with Opus as a
    function of ensemble size k, over random subsets of the non-Opus models."""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    # For each ensemble size k (1..max_k), sample n_trials random subsets of
    # k GenAI models, take majority vote, compare to Opus.
    all_models = sorted(ALL_GENAI)
    # Remove Opus itself from ensemble candidates (it is the reference).
    ensemble_candidates = [m for m in all_models if m != "Opus 4.6"]
    rng = np.random.RandomState(42)  # fixed seed -> reproducible subsets
    max_k = len(ensemble_candidates)
    n_trials = 200
    cat_accs_by_k = []
    spec_accs_by_k = []
    for k in range(1, max_k + 1):
        cat_accs = []
        spec_accs = []
        subsets = []
        if k >= max_k:
            subsets = [ensemble_candidates]  # only one possible full subset
        else:
            for _ in range(n_trials):
                subsets.append(list(rng.choice(ensemble_candidates, k, replace=False)))
        for subset in subsets:
            agree_cat, agree_spec, total = 0, 0, 0
            for pid, c in consensus.items():
                sig = c["signals"]
                if "Opus 4.6" not in sig:
                    continue  # no reference label for this paragraph
                sub_cats = [sig[m]["cat"] for m in subset if m in sig]
                sub_specs = [sig[m]["spec"] for m in subset if m in sig]
                if len(sub_cats) < k:
                    continue  # require every ensemble member to have labeled it
                total += 1
                ens_cat = majority_vote(sub_cats)
                ens_spec = majority_vote([str(s) for s in sub_specs])
                # A hung vote (None) simply counts as a miss here.
                if ens_cat == sig["Opus 4.6"]["cat"]:
                    agree_cat += 1
                if ens_spec is not None and int(ens_spec) == sig["Opus 4.6"]["spec"]:
                    agree_spec += 1
            if total > 0:
                cat_accs.append(agree_cat / total * 100)
                spec_accs.append(agree_spec / total * 100)
        cat_accs_by_k.append(cat_accs)
        spec_accs_by_k.append(spec_accs)
    # One box per ensemble size (spread across the sampled subsets).
    ks = range(1, max_k + 1)
    ax1.boxplot(cat_accs_by_k, positions=list(ks), widths=0.6, patch_artist=True,
                boxprops=dict(facecolor="#3498db", alpha=0.5),
                medianprops=dict(color="red", linewidth=2))
    ax1.set_xlabel("Ensemble size (# GenAI models)")
    ax1.set_ylabel("Category agreement with Opus (%)")
    ax1.set_title("Ensemble Size vs Category Accuracy", fontweight="bold")
    ax1.set_xticks(list(ks))
    ax1.set_xticklabels(list(ks))
    ax2.boxplot(spec_accs_by_k, positions=list(ks), widths=0.6, patch_artist=True,
                boxprops=dict(facecolor="#e74c3c", alpha=0.5),
                medianprops=dict(color="red", linewidth=2))
    ax2.set_xlabel("Ensemble size (# GenAI models)")
    ax2.set_ylabel("Specificity agreement with Opus (%)")
    ax2.set_title("Ensemble Size vs Specificity Accuracy", fontweight="bold")
    ax2.set_xticks(list(ks))
    ax2.set_xticklabels(list(ks))
    fig.tight_layout()
    fig.savefig(CHART_DIR / "12_ensemble_accuracy.png", dpi=150)
    plt.close(fig)
    print(" 12_ensemble_accuracy.png")
# ═══════════════════════════════════════════════════════════
# CHART 13: Agreement by word count (human + genai)
# ═══════════════════════════════════════════════════════════
def plot_agreement_by_wordcount():
    """Chart 13: human vs GenAI unanimity rates bucketed by paragraph length.

    Left panel: category unanimity; right panel: category+specificity
    unanimity.  Reads the module-level ``consensus`` dict.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
    # The last bin is open-ended so paragraphs longer than 500 words are no
    # longer silently dropped; its label now matches the bin start (181+).
    wc_bins = [(0, 50), (51, 80), (81, 120), (121, 180), (181, float("inf"))]
    bin_labels = ["≤50", "51-80", "81-120", "121-180", "181+"]
    for ax, dim, title in [(ax1, "cat", "Category"), (ax2, "both", "Both")]:
        h_rates, g_rates, ns = [], [], []
        for lo, hi in wc_bins:
            h_agree, g_agree, total = 0, 0, 0
            for pid, c in consensus.items():
                wc = c["word_count"]
                if lo <= wc <= hi:
                    total += 1
                    if dim == "cat":
                        # Unanimity on category alone.
                        if c["human_cat_unanimous"]:
                            h_agree += 1
                        if len(set(c["genai_cats"])) == 1:
                            g_agree += 1
                    else:
                        # Unanimity on both category and specificity.
                        if c["human_cat_unanimous"] and c["human_spec_unanimous"]:
                            h_agree += 1
                        if len(set(c["genai_cats"])) == 1 and len(set(c["genai_specs"])) == 1:
                            g_agree += 1
            h_rates.append(h_agree / total * 100 if total > 0 else 0)
            g_rates.append(g_agree / total * 100 if total > 0 else 0)
            ns.append(total)
        x = np.arange(len(bin_labels))
        width = 0.35
        ax.bar(x - width / 2, h_rates, width, label="Human unanimous", color="#3498db")
        ax.bar(x + width / 2, g_rates, width, label="GenAI unanimous", color="#e74c3c")
        ax.set_xticks(x)
        ax.set_xticklabels(bin_labels)
        ax.set_xlabel("Word Count")
        ax.set_ylabel("Unanimous %")
        ax.set_title(f"{title} Unanimity by Paragraph Length", fontweight="bold")
        ax.legend()
        # Annotate each bin with its sample size.
        for i, n in enumerate(ns):
            ax.text(i, max(h_rates[i], g_rates[i]) + 1, f"n={n}", ha="center", fontsize=8)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "13_agreement_by_wordcount.png", dpi=150)
    plt.close(fig)
    print(" 13_agreement_by_wordcount.png")
# ═══════════════════════════════════════════════════════════
# CHART 14: Time vs agreement
# ═══════════════════════════════════════════════════════════
def plot_time_vs_agreement():
    """Chart 14: distribution of per-paragraph labeling time, split by
    whether the human annotators agreed on the category."""
    fig, ax = plt.subplots(figsize=(10, 5))
    t_agree: list[float] = []
    t_differ: list[float] = []
    for pid, lbls in human_by_pid.items():
        raw = [rec.get("activeMs") or rec.get("durationMs") for rec in lbls]
        valid = [ms for ms in raw if ms is not None]
        if not valid:
            continue
        # Upper-median active time, converted from ms to seconds.
        median_sec = sorted(valid)[len(valid) // 2] / 1000
        distinct = {rec["contentCategory"] for rec in lbls}
        (t_agree if len(distinct) == 1 else t_differ).append(median_sec)
    edges = np.linspace(0, 120, 30)
    ax.hist(t_agree, bins=edges, alpha=0.6, label=f"Category agreed (n={len(t_agree)})",
            color="#2ecc71", density=True)
    ax.hist(t_differ, bins=edges, alpha=0.6, label=f"Category disagreed (n={len(t_differ)})",
            color="#e74c3c", density=True)
    ax.set_xlabel("Median Active Time per Paragraph (seconds)")
    ax.set_ylabel("Density")
    ax.set_title("Labeling Time: Agreed vs Disagreed Paragraphs", fontweight="bold")
    ax.legend()
    ax.set_xlim(0, 120)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "14_time_vs_agreement.png", dpi=150)
    plt.close(fig)
    print(" 14_time_vs_agreement.png")
# ═══════════════════════════════════════════════════════════
# CHART 15: Outlier annotator deep-dive
# ═══════════════════════════════════════════════════════════
def plot_outlier_annotator():
    """Chart 15: deep-dive on the outlier annotator (lowest average pairwise κ).

    Left: the categories the other two annotators agreed on when the outlier
    diverged.  Right: the categories the outlier chose instead.
    Removed a dead ``name_to_id`` mapping that was built but never used.
    """
    cat_kappas = metrics["pairwiseKappa"]["category"]["pairs"]
    ann_kappa_sum = defaultdict(lambda: {"sum": 0, "n": 0})
    for pair in cat_kappas:
        for a in ("a1", "a2"):
            ann_kappa_sum[pair[a]]["sum"] += pair["kappa"]
            ann_kappa_sum[pair[a]]["n"] += 1
    # Outlier = annotator with the lowest mean kappa across their pairs.
    outlier = min(ann_kappa_sum, key=lambda a: ann_kappa_sum[a]["sum"] / ann_kappa_sum[a]["n"])
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
    outlier_diverge_from = Counter()
    outlier_diverge_to = Counter()
    for pid, lbls in human_by_pid.items():
        outlier_lbl = None
        others = []
        for l in lbls:
            if l["annotatorName"] == outlier:
                outlier_lbl = l
            else:
                others.append(l)
        # Count only paragraphs where the two non-outliers agree with each
        # other but the outlier picked a different category.
        if outlier_lbl and len(others) >= 2:
            other_cats = [o["contentCategory"] for o in others]
            if other_cats[0] == other_cats[1] and other_cats[0] != outlier_lbl["contentCategory"]:
                outlier_diverge_from[other_cats[0]] += 1
                outlier_diverge_to[outlier_lbl["contentCategory"]] += 1
    cats1 = sorted(outlier_diverge_from.keys(), key=lambda c: -outlier_diverge_from[c])
    ax1.barh(range(len(cats1)), [outlier_diverge_from[c] for c in cats1], color="#e74c3c")
    ax1.set_yticks(range(len(cats1)))
    ax1.set_yticklabels([CAT_MAP.get(c, c) for c in cats1])
    ax1.set_xlabel("Count")
    ax1.set_title(f"{outlier}: what others agreed on", fontweight="bold")
    ax1.invert_yaxis()
    cats2 = sorted(outlier_diverge_to.keys(), key=lambda c: -outlier_diverge_to[c])
    ax2.barh(range(len(cats2)), [outlier_diverge_to[c] for c in cats2], color="#f39c12")
    ax2.set_yticks(range(len(cats2)))
    ax2.set_yticklabels([CAT_MAP.get(c, c) for c in cats2])
    ax2.set_xlabel("Count")
    ax2.set_title(f"What {outlier} chose instead", fontweight="bold")
    ax2.invert_yaxis()
    avg_k = ann_kappa_sum[outlier]["sum"] / ann_kappa_sum[outlier]["n"]
    fig.suptitle(f"Outlier Analysis: {outlier} (avg κ = {avg_k:.3f})", fontweight="bold")
    fig.tight_layout()
    fig.savefig(CHART_DIR / "15_outlier_annotator.png", dpi=150)
    plt.close(fig)
    print(" 15_outlier_annotator.png")
# ═══════════════════════════════════════════════════════════
# CHART 16: With/without outlier consensus
# ═══════════════════════════════════════════════════════════
def plot_with_without_outlier():
    """Chart 16: does excluding the lowest-κ annotator raise agreement?

    Left: unanimity rates on the outlier's paragraphs with vs without them.
    Right: pairwise-κ distribution for all pairs vs pairs excluding the
    outlier.  Now guards against zero eligible paragraphs (previously a
    ZeroDivisionError when the outlier appeared on no fully-tripled pids).
    """
    cat_kappas = metrics["pairwiseKappa"]["category"]["pairs"]
    ann_kappa_sum = defaultdict(lambda: {"sum": 0, "n": 0})
    for pair in cat_kappas:
        for a in ("a1", "a2"):
            ann_kappa_sum[pair[a]]["sum"] += pair["kappa"]
            ann_kappa_sum[pair[a]]["n"] += 1
    # Outlier = annotator with the lowest mean pairwise kappa.
    outlier = min(ann_kappa_sum, key=lambda a: ann_kappa_sum[a]["sum"] / ann_kappa_sum[a]["n"])
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
    n = 0
    cat_w, cat_wo, spec_w, spec_wo, both_w, both_wo = 0, 0, 0, 0, 0, 0
    for pid, lbls in human_by_pid.items():
        names = [l["annotatorName"] for l in lbls]
        # Only paragraphs labeled by all 3 annotators including the outlier.
        if outlier not in names or len(lbls) < 3:
            continue
        n += 1
        cats_all = [l["contentCategory"] for l in lbls]
        specs_all = [l["specificityLevel"] for l in lbls]
        cats_excl = [l["contentCategory"] for l in lbls if l["annotatorName"] != outlier]
        specs_excl = [l["specificityLevel"] for l in lbls if l["annotatorName"] != outlier]
        cat_u = len(set(cats_all)) == 1
        cat_e = len(set(cats_excl)) == 1
        spec_u = len(set(specs_all)) == 1
        spec_e = len(set(specs_excl)) == 1
        if cat_u: cat_w += 1
        if cat_e: cat_wo += 1
        if spec_u: spec_w += 1
        if spec_e: spec_wo += 1
        if cat_u and spec_u: both_w += 1
        if cat_e and spec_e: both_wo += 1
    if n == 0:
        # Nothing to plot — avoid dividing by zero below.
        plt.close(fig)
        print(" 16_with_without_outlier.png (skipped: no eligible paragraphs)")
        return
    labels_m = ["Category\nUnanimous", "Specificity\nUnanimous", "Both\nUnanimous"]
    with_v = [cat_w / n * 100, spec_w / n * 100, both_w / n * 100]
    without_v = [cat_wo / n * 100, spec_wo / n * 100, both_wo / n * 100]
    x = np.arange(3)
    width = 0.35
    ax1.bar(x - width / 2, with_v, width, label="All 3", color="#e74c3c")
    ax1.bar(x + width / 2, without_v, width, label=f"Excl. {outlier}", color="#2ecc71")
    ax1.set_xticks(x)
    ax1.set_xticklabels(labels_m)
    ax1.set_ylabel("% of paragraphs")
    ax1.set_title(f"Agreement on {outlier}'s paragraphs (n={n})", fontweight="bold")
    ax1.legend()
    # Annotate the delta (percentage points gained by excluding the outlier).
    for i, (w, wo) in enumerate(zip(with_v, without_v)):
        ax1.text(i, max(w, wo) + 2, f"Δ={wo - w:+.1f}pp", ha="center", fontsize=9, fontweight="bold")
    kappas_with = [p["kappa"] for p in cat_kappas]
    kappas_without = [p["kappa"] for p in cat_kappas if outlier not in (p["a1"], p["a2"])]
    bp = ax2.boxplot([kappas_with, kappas_without], positions=[1, 2], widths=0.5, patch_artist=True)
    bp["boxes"][0].set_facecolor("#e74c3c")
    bp["boxes"][0].set_alpha(0.5)
    bp["boxes"][1].set_facecolor("#2ecc71")
    bp["boxes"][1].set_alpha(0.5)
    ax2.set_xticks([1, 2])
    ax2.set_xticklabels(["All pairs", f"Excl. {outlier}"])
    ax2.set_ylabel("Cohen's κ (category)")
    ax2.set_title("Kappa Distribution", fontweight="bold")
    # Overlay the individual kappa points with a little horizontal jitter.
    rng = np.random.RandomState(42)
    for pos, kappas in zip([1, 2], [kappas_with, kappas_without]):
        jitter = rng.normal(0, 0.04, len(kappas))
        ax2.scatter([pos + j for j in jitter], kappas, alpha=0.6, s=30, color="black", zorder=3)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "16_with_without_outlier.png", dpi=150)
    plt.close(fig)
    print(" 16_with_without_outlier.png")
# ═══════════════════════════════════════════════════════════
# CHART 17: Disagreement axes — Human vs Stage1 vs All GenAI
# ═══════════════════════════════════════════════════════════
def plot_disagreement_axes():
    """Chart 17: most frequent pairwise category confusions for humans,
    the Stage 1 panel, and all ten GenAI models."""
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    def tally_pairs(cat_lists: list[list[str]]) -> Counter:
        # Count every unordered pair of differing labels within a paragraph.
        tallies = Counter()
        for cats in cat_lists:
            if len(set(cats)) < 2:
                continue
            for c1, c2 in combinations(cats, 2):
                if c1 != c2:
                    tallies[tuple(sorted([c1, c2]))] += 1
        return tallies

    human_axes = tally_pairs([c["human_cats"] for c in consensus.values()])
    s1_axes = tally_pairs([c["s1_cats"] for c in consensus.values()])
    genai_axes = tally_pairs([c["genai_cats"] for c in consensus.values()])
    panels = [
        (axes[0], human_axes, "Human", "#e74c3c"),
        (axes[1], s1_axes, "Stage 1", "#3498db"),
        (axes[2], genai_axes, "All GenAI (10)", "#2ecc71"),
    ]
    for ax, data, title, color in panels:
        top = data.most_common(10)
        pair_labels = [f"{CAT_MAP[a]}{CAT_MAP[b]}" for (a, b), _ in top]
        pair_counts = [cnt for _, cnt in top]
        ax.barh(range(len(pair_labels)), pair_counts, color=color)
        ax.set_yticks(range(len(pair_labels)))
        ax.set_yticklabels(pair_labels, fontsize=9)
        ax.set_xlabel("Disagreement count")
        ax.set_title(f"{title} Confusion Axes", fontweight="bold")
        ax.invert_yaxis()
    fig.tight_layout()
    fig.savefig(CHART_DIR / "17_disagreement_axes.png", dpi=150)
    plt.close(fig)
    print(" 17_disagreement_axes.png")
# ═══════════════════════════════════════════════════════════
# CHART 18: None/Other analysis
# ═══════════════════════════════════════════════════════════
def plot_none_other_analysis():
    """Chart 18: where the None/Other label collides with substantive labels.

    Left: when one annotator says None/Other and the paragraph is split, what
    did the others choose?  Right: the GenAI majority label on every
    human-disagreed paragraph.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
    vs_counts = Counter()
    split_pids = set()
    for pid, lbls in human_by_pid.items():
        cats = [rec["contentCategory"] for rec in lbls]
        if "None/Other" not in cats or len(set(cats)) <= 1:
            continue
        split_pids.add(pid)
        for cat in cats:
            if cat != "None/Other":
                vs_counts[cat] += 1
    ordered = sorted(vs_counts, key=lambda cat: -vs_counts[cat])
    ax1.barh(range(len(ordered)), [vs_counts[cat] for cat in ordered], color="#e74c3c")
    ax1.set_yticks(range(len(ordered)))
    ax1.set_yticklabels([CAT_MAP.get(cat, cat) for cat in ordered])
    ax1.set_xlabel("Count")
    ax1.set_title(f"When someone says N/O, others say...\n({len(split_pids)} paragraphs)",
                  fontweight="bold")
    ax1.invert_yaxis()
    # GenAI majority vote on paragraphs where the humans split on category.
    genai_maj_counts = Counter()
    for c in consensus.values():
        if not c["human_cat_unanimous"] and c["genai_cat_maj"]:
            genai_maj_counts[c["genai_cat_maj"]] += 1
    ordered2 = sorted(genai_maj_counts, key=lambda cat: -genai_maj_counts[cat])
    ax2.barh(range(len(ordered2)), [genai_maj_counts[cat] for cat in ordered2], color="#3498db")
    ax2.set_yticks(range(len(ordered2)))
    ax2.set_yticklabels([CAT_MAP.get(cat, cat) for cat in ordered2])
    ax2.set_xlabel("Count")
    ax2.set_title(f"GenAI majority for human-disagreed\n(n={sum(genai_maj_counts.values())})",
                  fontweight="bold")
    ax2.invert_yaxis()
    fig.tight_layout()
    fig.savefig(CHART_DIR / "18_none_other_analysis.png", dpi=150)
    plt.close(fig)
    print(" 18_none_other_analysis.png")
# ═══════════════════════════════════════════════════════════
# CHART 19: Specificity bias per model vs Opus
# ═══════════════════════════════════════════════════════════
def plot_specificity_bias_all():
    """Chart 19: mean specificity bias of every human and GenAI source
    relative to Opus 4.6 (positive = rates specificity higher than Opus)."""
    fig, ax = plt.subplots(figsize=(16, 6))
    sources = annotator_names + sorted(ALL_GENAI)
    biases = []
    for src in sources:
        diffs = []
        for pid, c in consensus.items():
            if "Opus 4.6" not in c["signals"]:
                continue
            opus_spec = c["signals"]["Opus 4.6"]["spec"]
            if src in annotator_names:
                # Human annotator: look up their label for this paragraph.
                for l in human_by_pid.get(pid, []):
                    if l["annotatorName"] == src:
                        diffs.append(l["specificityLevel"] - opus_spec)
            elif src in c["signals"] and src != "Opus 4.6":
                diffs.append(c["signals"][src]["spec"] - opus_spec)
        biases.append(np.mean(diffs) if diffs else 0)
    # Invert the short-name map once (previously rebuilt inside the per-bar
    # loop on every iteration).
    short_to_mid = {v: k for k, v in MODEL_SHORT.items()}
    colors = []
    for src, b in zip(sources, biases):
        if src in annotator_names:
            colors.append("#9b59b6" if abs(b) > 0.5 else "#8e44ad")
        else:
            mid = short_to_mid.get(src, "")
            colors.append(TIER_COLORS.get(MODEL_TIER.get(mid, "mid"), "#999"))
    bars = ax.bar(range(len(sources)), biases, color=colors, edgecolor="black", linewidth=0.3)
    ax.set_xticks(range(len(sources)))
    ax.set_xticklabels(sources, rotation=60, ha="right", fontsize=7)
    # Label now spells out the difference being averaged.
    ax.set_ylabel("Mean (Source - Opus) Specificity")
    ax.set_title("Specificity Bias vs Opus 4.6 (positive = over-rates specificity)", fontweight="bold")
    ax.axhline(0, color="black", linewidth=1)
    # Vertical separator between the human and GenAI groups.
    ax.axvline(len(annotator_names) - 0.5, color="gray", linewidth=1, linestyle="--", alpha=0.5)
    ax.text(len(annotator_names) / 2, ax.get_ylim()[1] * 0.9, "Humans", ha="center", fontsize=9, style="italic")
    ax.text(len(annotator_names) + len(ALL_GENAI) / 2, ax.get_ylim()[1] * 0.9, "GenAI", ha="center", fontsize=9, style="italic")
    for bar, b in zip(bars, biases):
        if abs(b) > 0.05:
            ax.text(bar.get_x() + bar.get_width() / 2,
                    bar.get_height() + (0.02 if b >= 0 else -0.06),
                    f"{b:+.2f}", ha="center", fontsize=7)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "19_specificity_bias_all.png", dpi=150)
    plt.close(fig)
    print(" 19_specificity_bias_all.png")
# ═══════════════════════════════════════════════════════════
# CHART 20: Quiz vs quality
# ═══════════════════════════════════════════════════════════
def plot_quiz_vs_quality():
    """Chart 20: quiz attempts per annotator vs category agreement with Opus."""
    fig, ax = plt.subplots(figsize=(10, 5))
    attempts: dict[str, int] = defaultdict(int)
    for session in load_jsonl(GOLD_DIR / "quiz-sessions.jsonl"):
        attempts[session["annotatorName"]] += 1
    # paragraphId -> label record, per annotator name.
    per_annotator: dict[str, dict[str, dict]] = defaultdict(dict)
    for lbl in human_labels:
        per_annotator[lbl["annotatorName"]][lbl["paragraphId"]] = lbl
    opus_agree = {}
    for name in annotator_names:
        agree = total = 0
        for pid, lbl in per_annotator[name].items():
            c = consensus.get(pid)
            if not (c and c["opus_cat"]):
                continue
            total += 1
            agree += lbl["contentCategory"] == c["opus_cat"]
        opus_agree[name] = agree / total * 100 if total > 0 else 0
    x = np.arange(len(annotator_names))
    width = 0.35
    ax.bar(x - width / 2, [attempts.get(n, 0) for n in annotator_names],
           width, label="Quiz attempts", color="#f39c12")
    # Second y-axis for the agreement percentage.
    ax2 = ax.twinx()
    ax2.bar(x + width / 2, [opus_agree.get(n, 0) for n in annotator_names],
            width, label="Cat agree w/ Opus (%)", color="#3498db", alpha=0.7)
    ax.set_xticks(x)
    ax.set_xticklabels(annotator_names, rotation=45, ha="right")
    ax.set_ylabel("Quiz attempts", color="#f39c12")
    ax2.set_ylabel("Opus agreement %", color="#3498db")
    ax.set_title("Quiz Attempts vs Labeling Quality (Opus Agreement)", fontweight="bold")
    handles_a, labels_a = ax.get_legend_handles_labels()
    handles_b, labels_b = ax2.get_legend_handles_labels()
    ax.legend(handles_a + handles_b, labels_a + labels_b, loc="upper left")
    fig.tight_layout()
    fig.savefig(CHART_DIR / "20_quiz_vs_quality.png", dpi=150)
    plt.close(fig)
    print(" 20_quiz_vs_quality.png")
# ═══════════════════════════════════════════════════════════
# CHART 21: Human vs GenAI consensus rates
# ═══════════════════════════════════════════════════════════
def plot_human_vs_genai_consensus():
    """Chart 21: human vs GenAI consensus.

    Panels: (1) unanimity rates (human 3/3, GenAI 10/10, both);
    (2) human-majority vs GenAI-majority category agreement;
    (3) the same for specificity.
    """
    fig, axes = plt.subplots(1, 3, figsize=(16, 5))
    n_paragraphs = len(consensus)
    # Category unanimity
    h_unan = sum(1 for c in consensus.values() if c["human_cat_unanimous"])
    g_unan = sum(1 for c in consensus.values() if len(set(c["genai_cats"])) == 1)
    b_unan = sum(1 for c in consensus.values() if c["human_cat_unanimous"] and len(set(c["genai_cats"])) == 1)
    ax = axes[0]
    vals = [h_unan, g_unan, b_unan]
    # Percentages over the actual paragraph count (was hard-coded as 1200).
    pcts = [v / n_paragraphs * 100 if n_paragraphs else 0 for v in vals]
    labels = ["Human\n3/3", "GenAI\n10/10", "Both"]
    bars = ax.bar(range(3), pcts, color=["#3498db", "#e74c3c", "#2ecc71"])
    ax.set_xticks(range(3))
    ax.set_xticklabels(labels)
    ax.set_ylabel("%")
    ax.set_title("Category Unanimity", fontweight="bold")
    for bar, v, p in zip(bars, vals, pcts):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1,
                f"{p:.1f}%\n({v})", ha="center", fontsize=9)
    # Majority agreement (category)
    ax = axes[1]
    cat_agree = sum(1 for c in consensus.values()
                    if c["human_cat_maj"] and c["genai_cat_maj"] and c["human_cat_maj"] == c["genai_cat_maj"])
    cat_total = sum(1 for c in consensus.values() if c["human_cat_maj"] and c["genai_cat_maj"])
    cat_diff = cat_total - cat_agree
    # Guard against an empty denominator.
    cat_pcts = ([cat_agree / cat_total * 100, cat_diff / cat_total * 100]
                if cat_total > 0 else [0, 0])
    bars = ax.bar(range(2), cat_pcts, color=["#2ecc71", "#e74c3c"])
    ax.set_xticks(range(2))
    ax.set_xticklabels(["Agree", "Differ"])
    ax.set_ylabel("%")
    ax.set_title(f"Human Maj vs GenAI Maj — Category\n(n={cat_total})", fontweight="bold")
    for bar, v in zip(bars, [cat_agree, cat_diff]):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5,
                f"{v}", ha="center", fontsize=10)
    # Majority agreement (specificity)
    ax = axes[2]
    spec_agree = sum(1 for c in consensus.values()
                     if c["human_spec_maj"] is not None and c["genai_spec_maj"] is not None
                     and c["human_spec_maj"] == c["genai_spec_maj"])
    spec_total = sum(1 for c in consensus.values()
                     if c["human_spec_maj"] is not None and c["genai_spec_maj"] is not None)
    spec_diff = spec_total - spec_agree
    spec_pcts = ([spec_agree / spec_total * 100, spec_diff / spec_total * 100]
                 if spec_total > 0 else [0, 0])
    bars = ax.bar(range(2), spec_pcts, color=["#2ecc71", "#e74c3c"])
    ax.set_xticks(range(2))
    ax.set_xticklabels(["Agree", "Differ"])
    ax.set_ylabel("%")
    ax.set_title(f"Human Maj vs GenAI Maj — Specificity\n(n={spec_total})", fontweight="bold")
    for bar, v in zip(bars, [spec_agree, spec_diff]):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5,
                f"{v}", ha="center", fontsize=10)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "21_human_vs_genai_consensus.png", dpi=150)
    plt.close(fig)
    print(" 21_human_vs_genai_consensus.png")
# ═══════════════════════════════════════════════════════════
# CHART 22: Signal agreement distribution (how many of 13 agree?)
# ═══════════════════════════════════════════════════════════
def plot_signal_agreement_dist():
    """Chart 22: per-paragraph maximum agreement count across all 13 signals,
    for category (left) and specificity (right)."""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
    # Size of the largest agreeing bloc per paragraph.
    cat_top_counts = [c["all_cat_counts"].most_common(1)[0][1] for c in consensus.values()]
    spec_top_counts = [Counter(c["all_specs"]).most_common(1)[0][1] for c in consensus.values()]
    panels = [
        (ax1, cat_top_counts, "#3498db", "category", "Category"),
        (ax2, spec_top_counts, "#e74c3c", "specificity", "Specificity"),
    ]
    for ax, counts, color, dim, heading in panels:
        ax.hist(counts, bins=range(1, 15), color=color, edgecolor="black", alpha=0.7, align="left")
        ax.set_xlabel(f"# signals agreeing on top {dim}")
        ax.set_ylabel("Paragraphs")
        ax.set_title(f"{heading}: Max Agreement Count per Paragraph", fontweight="bold")
        ax.axvline(10, color="red", linewidth=2, linestyle="--", label="Tier 1 threshold (10+)")
        ax.legend()
    fig.tight_layout()
    fig.savefig(CHART_DIR / "22_signal_agreement_dist.png", dpi=150)
    plt.close(fig)
    print(" 22_signal_agreement_dist.png")
# ═══════════════════════════════════════════════════════════
# CHART 23: Per-annotator agreement with all references
# ═══════════════════════════════════════════════════════════
def plot_annotator_vs_references():
    """Chart 23: each human annotator's agreement rate against three
    references (Stage 1 majority, Opus golden label, GenAI majority),
    for category (left) and specificity (right)."""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
    # paragraphId -> label record, keyed per annotator name.
    ann_labels_by_name: dict[str, dict[str, dict]] = defaultdict(dict)
    for l in human_labels:
        ann_labels_by_name[l["annotatorName"]][l["paragraphId"]] = l
    # (display name, consensus key for category, consensus key for specificity)
    refs = [
        ("S1 Maj", "s1_cat_maj", "s1_spec_maj"),
        ("Opus", "opus_cat", "opus_spec"),
        ("GenAI Maj", "genai_cat_maj", "genai_spec_maj"),
    ]
    for ax, dim, title in [(ax1, "cat", "Category"), (ax2, "spec", "Specificity")]:
        x = np.arange(len(annotator_names))
        width = 0.25
        for ri, (ref_name, ref_cat, ref_spec) in enumerate(refs):
            rates = []
            for ann_name in annotator_names:
                agree, total = 0, 0
                for pid, lbl in ann_labels_by_name[ann_name].items():
                    c = consensus.get(pid)
                    if not c:
                        continue
                    ref_val = c[ref_cat] if dim == "cat" else c[ref_spec]
                    ann_val = lbl["contentCategory"] if dim == "cat" else lbl["specificityLevel"]
                    # Skip paragraphs where the reference has no value.
                    if ref_val is not None:
                        total += 1
                        # str() so int specificities compare against string refs.
                        if str(ann_val) == str(ref_val):
                            agree += 1
                rates.append(agree / total * 100 if total > 0 else 0)
            # Three grouped bars per annotator, one per reference.
            ax.bar(x + (ri - 1) * width, rates, width, label=ref_name)
        ax.set_xticks(x)
        ax.set_xticklabels(annotator_names, rotation=45, ha="right")
        ax.set_ylabel("Agreement %")
        ax.set_title(f"Per-Annotator {title} Agreement", fontweight="bold")
        ax.legend()
        ax.set_ylim(0, 100)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "23_annotator_vs_references.png", dpi=150)
    plt.close(fig)
    print(" 23_annotator_vs_references.png")
# ═══════════════════════════════════════════════════════════
# CHART 24: "Hard paragraph" analysis — what makes Tier 4 different?
# ═══════════════════════════════════════════════════════════
def plot_hard_paragraphs():
    """Chart 24: what distinguishes hard (Tier 4) paragraphs?

    Four panels: word-count distribution per tier, category profile per tier,
    specificity profile per tier, and the top confusion axes within Tier 4.
    Reads the module-level ``tiers`` (tier -> list of paragraph ids) and
    ``consensus`` structures.
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    # Word count distribution by tier
    ax = axes[0][0]
    tier_wcs = {t: [consensus[pid]["word_count"] for pid in pids] for t, pids in tiers.items()}
    data = [tier_wcs[t] for t in range(1, 5)]
    bp = ax.boxplot(data, positions=range(1, 5), widths=0.6, patch_artist=True)
    # Green → red as tiers get harder; reused by the line plots below.
    colors_t = ["#27ae60", "#3498db", "#f39c12", "#e74c3c"]
    for patch, color in zip(bp["boxes"], colors_t):
        patch.set_facecolor(color)
        patch.set_alpha(0.5)
    ax.set_xticklabels([f"Tier {t}" for t in range(1, 5)])
    ax.set_ylabel("Word count")
    ax.set_title("Paragraph Length by Tier", fontweight="bold")
    # Category distribution by tier
    ax = axes[0][1]
    for t in range(1, 5):
        cats = Counter()
        for pid in tiers[t]:
            # Plurality category across all 13 signals for this paragraph.
            top_cat = consensus[pid]["all_cat_counts"].most_common(1)[0][0]
            cats[top_cat] += 1
        pcts = [cats.get(c, 0) / len(tiers[t]) * 100 if tiers[t] else 0 for c in CATEGORIES]
        ax.plot(range(len(CATEGORIES)), pcts, marker="o", label=f"Tier {t}", color=colors_t[t - 1])
    ax.set_xticks(range(len(CAT_SHORT)))
    ax.set_xticklabels(CAT_SHORT)
    ax.set_ylabel("% of tier")
    ax.set_title("Category Profile by Tier", fontweight="bold")
    ax.legend()
    # Specificity distribution by tier
    ax = axes[1][0]
    for t in range(1, 5):
        specs = Counter()
        for pid in tiers[t]:
            # Plurality specificity level across all signals.
            top_spec = Counter(consensus[pid]["all_specs"]).most_common(1)[0][0]
            specs[top_spec] += 1
        pcts = [specs.get(s, 0) / len(tiers[t]) * 100 if tiers[t] else 0 for s in SPEC_LEVELS]
        ax.plot(SPEC_LEVELS, pcts, marker="s", label=f"Tier {t}", color=colors_t[t - 1])
    ax.set_xticks(SPEC_LEVELS)
    ax.set_xticklabels(["S1", "S2", "S3", "S4"])
    ax.set_ylabel("% of tier")
    ax.set_title("Specificity Profile by Tier", fontweight="bold")
    ax.legend()
    # For Tier 4, what are the top confusion axes?
    ax = axes[1][1]
    t4_axes = Counter()
    for pid in tiers[4]:
        cats = consensus[pid]["all_cats"]
        unique = set(cats)
        # Each unordered pair of distinct labels counts once per paragraph.
        if len(unique) >= 2:
            for a, b in combinations(unique, 2):
                t4_axes[tuple(sorted([a, b]))] += 1
    top = t4_axes.most_common(8)
    if top:
        labels = [f"{CAT_MAP[a]}{CAT_MAP[b]}" for (a, b), _ in top]
        counts = [c for _, c in top]
        ax.barh(range(len(labels)), counts, color="#e74c3c")
        ax.set_yticks(range(len(labels)))
        ax.set_yticklabels(labels)
        ax.set_xlabel("Count")
        ax.set_title(f"Tier 4 Confusion Axes (n={len(tiers[4])})", fontweight="bold")
        ax.invert_yaxis()
    fig.tight_layout()
    fig.savefig(CHART_DIR / "24_hard_paragraphs.png", dpi=150)
    plt.close(fig)
    print(" 24_hard_paragraphs.png")
# ═══════════════════════════════════════════════════════════
# CHART 25: Model agreement with human majority (per category)
# ═══════════════════════════════════════════════════════════
def plot_model_vs_human_per_category():
    """Chart 25: heatmap of each model's per-category recall against the
    human-majority label."""
    fig, ax = plt.subplots(figsize=(16, 8))
    models = sorted(ALL_GENAI)
    recall = np.zeros((len(models), len(CATEGORIES)))
    for row, model in enumerate(models):
        for col, cat in enumerate(CATEGORIES):
            hits = seen = 0
            for c in consensus.values():
                if c["human_cat_maj"] != cat:
                    continue
                model_sig = c["signals"].get(model)
                if model_sig is None:
                    continue
                seen += 1
                hits += model_sig["cat"] == cat
            recall[row][col] = hits / seen * 100 if seen > 0 else 0
    im = ax.imshow(recall, cmap="RdYlGn", aspect="auto", vmin=40, vmax=100)
    ax.set_xticks(range(len(CAT_SHORT)))
    ax.set_xticklabels(CAT_SHORT, fontsize=10)
    ax.set_yticks(range(len(models)))
    ax.set_yticklabels(models, fontsize=8)
    ax.set_title("Per-Category Recall vs Human Majority (%)", fontweight="bold")
    ax.set_xlabel("Human majority label")
    # Annotate each cell; use white text on dark cells for contrast.
    for i in range(len(models)):
        for j in range(len(CATEGORIES)):
            val = recall[i][j]
            ax.text(j, i, f"{val:.0f}", ha="center", va="center", fontsize=8,
                    color="white" if val < 60 else "black")
    fig.colorbar(im, ax=ax, shrink=0.6, label="Recall %")
    fig.tight_layout()
    fig.savefig(CHART_DIR / "25_model_vs_human_per_category.png", dpi=150)
    plt.close(fig)
    print(" 25_model_vs_human_per_category.png")
# ═══════════════════════════════════════════════════════════
# CHART 26: Prompt version effect (v2.5 Stage1 vs v3.0 bench)
# ═══════════════════════════════════════════════════════════
def plot_prompt_version_effect():
    """Chart 26: effect of the prompt upgrade (v2.5 Stage 1 panel vs v3.0
    benchmark models) on agreement with the Opus golden labels.

    Left: per-model category agreement with Opus, grouped by prompt version.
    Right: confusion rates on three codebook-ruling axes per version group.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    # Stage 1 panel ran prompt v2.5; the benchmark models ran v3.0.
    v25_models = ["Gemini Lite", "Grok Fast", "MIMO Flash"]
    v30_models = [m for m in bench_by_model.keys() if m != "Opus 4.6"]
    # Category agreement with Opus per model
    model_acc = {}
    for model in v25_models + list(v30_models):
        agree, total = 0, 0
        for pid, c in consensus.items():
            sig = c["signals"]
            if model in sig and "Opus 4.6" in sig and model != "Opus 4.6":
                total += 1
                if sig[model]["cat"] == sig["Opus 4.6"]["cat"]:
                    agree += 1
        model_acc[model] = agree / total * 100 if total > 0 else 0
    v25_accs = [model_acc[m] for m in v25_models]
    v30_accs = [model_acc[m] for m in v30_models]
    # The boxplot `labels=` kwarg is deprecated (renamed `tick_labels` in
    # Matplotlib 3.9); set the tick labels explicitly so this works on both
    # older and newer Matplotlib versions.
    ax1.boxplot([v25_accs, v30_accs], patch_artist=True, boxprops=dict(alpha=0.5))
    ax1.set_xticks([1, 2])
    ax1.set_xticklabels(["v2.5 (Stage 1)", "v3.0 (Bench)"])
    for pos, accs in zip([1, 2], [v25_accs, v30_accs]):
        ax1.scatter([pos] * len(accs), accs, s=80, zorder=3, edgecolors="black")
        for acc, m in zip(accs, v25_models if pos == 1 else list(v30_models)):
            ax1.annotate(m, (pos, acc), textcoords="offset points", xytext=(8, 0), fontsize=7)
    ax1.set_ylabel("Category Agreement with Opus (%)")
    ax1.set_title("Prompt Version Effect on Category Accuracy", fontweight="bold")
    # Confusion on the three codebook-ruling axes actually plotted below:
    # MR↔RMP, N/O↔SI, BG↔MR.
    ruling_axes = [
        ("MR↔RMP", "Management Role", "Risk Management Process"),
        ("N/O↔SI", "None/Other", "Strategy Integration"),
        ("BG↔MR", "Board Governance", "Management Role"),
    ]
    x = np.arange(len(ruling_axes))
    width = 0.3
    for gi, (group_models, label, color) in enumerate([
        (v25_models, "v2.5", "#e74c3c"),
        (list(v30_models), "v3.0", "#3498db"),
    ]):
        confusion_rates = []
        for axis_label, cat_a, cat_b in ruling_axes:
            confuse, total_relevant = 0, 0
            for pid, c in consensus.items():
                sig = c["signals"]
                opus_cat = c.get("opus_cat")
                if opus_cat not in (cat_a, cat_b):
                    continue
                for m in group_models:
                    if m in sig:
                        total_relevant += 1
                        # Confusion = picked the *other* category on this axis.
                        if sig[m]["cat"] in (cat_a, cat_b) and sig[m]["cat"] != opus_cat:
                            confuse += 1
            confusion_rates.append(confuse / total_relevant * 100 if total_relevant > 0 else 0)
        ax2.bar(x + (gi - 0.5) * width, confusion_rates, width, label=label, color=color)
    ax2.set_xticks(x)
    ax2.set_xticklabels([a[0] for a in ruling_axes])
    ax2.set_ylabel("Confusion rate (%)")
    ax2.set_title("Codebook Ruling Axes: v2.5 vs v3.0 Confusion", fontweight="bold")
    ax2.legend()
    fig.tight_layout()
    fig.savefig(CHART_DIR / "26_prompt_version_effect.png", dpi=150)
    plt.close(fig)
    print(" 26_prompt_version_effect.png")
# ═══════════════════════════════════════════════════════════
# CHART 27: Human-GenAI agreement conditioned on difficulty
# ═══════════════════════════════════════════════════════════
def plot_conditional_agreement():
    """Chart 27: cross-conditioned agreement — GenAI-majority agreement when
    the humans are unanimous (left) and human-majority agreement when all
    GenAI models are unanimous (right), per category."""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    # Per category: [agree count, total count]
    h_unan_g_agree = {c: [0, 0] for c in CATEGORIES}
    g_unan_h_agree = {c: [0, 0] for c in CATEGORIES}
    for c in consensus.values():
        hm, gm = c["human_cat_maj"], c["genai_cat_maj"]
        if c["human_cat_unanimous"] and hm:
            h_unan_g_agree[hm][1] += 1
            h_unan_g_agree[hm][0] += gm == hm
        if gm and len(set(c["genai_cats"])) == 1:
            g_unan_h_agree[gm][1] += 1
            g_unan_h_agree[gm][0] += hm == gm

    def pct(bucket):
        agree, total = bucket
        return agree / total * 100 if total > 0 else 0

    h_rates = [pct(h_unan_g_agree[c]) for c in CATEGORIES]
    g_rates = [pct(g_unan_h_agree[c]) for c in CATEGORIES]
    x = np.arange(len(CATEGORIES))
    panels = [
        (ax1, h_rates, h_unan_g_agree, "#3498db", "GenAI majority agrees (%)",
         "When Humans are Unanimous → GenAI agreement"),
        (ax2, g_rates, g_unan_h_agree, "#e74c3c", "Human majority agrees (%)",
         "When GenAI is Unanimous → Human agreement"),
    ]
    for ax, rates, table, color, ylabel, title in panels:
        ax.bar(x, rates, color=color)
        ax.set_xticks(x)
        ax.set_xticklabels(CAT_SHORT)
        ax.set_ylabel(ylabel)
        ax.set_title(title, fontweight="bold")
        ax.set_ylim(0, 105)
        for i, (rate, cat) in enumerate(zip(rates, CATEGORIES)):
            n = table[cat][1]
            if n > 0:
                ax.text(i, rate + 1, f"{rate:.0f}%\nn={n}", ha="center", fontsize=8)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "27_conditional_agreement.png", dpi=150)
    plt.close(fig)
    print(" 27_conditional_agreement.png")
# ═══════════════════════════════════════════════════════════
# CHART 28: Model clustering — which models agree with which?
# ═══════════════════════════════════════════════════════════
def plot_model_clustering():
    """Chart 28: pairwise category-agreement matrix across all GenAI models,
    reordered with a greedy nearest-neighbor chain so similar models sit
    next to each other."""
    fig, ax = plt.subplots(figsize=(12, 8))
    models = sorted(ALL_GENAI)
    n = len(models)
    # Compute pairwise agreement rate (simpler than kappa, more intuitive)
    agree_matrix = np.zeros((n, n))
    for i, m1 in enumerate(models):
        for j, m2 in enumerate(models):
            if i == j:
                # A model always agrees with itself.
                agree_matrix[i][j] = 100
                continue
            agree, total = 0, 0
            for pid, c in consensus.items():
                sig = c["signals"]
                # Only paragraphs where both models produced a signal.
                if m1 in sig and m2 in sig:
                    total += 1
                    if sig[m1]["cat"] == sig[m2]["cat"]:
                        agree += 1
            agree_matrix[i][j] = agree / total * 100 if total > 0 else 0
    # Hierarchical clustering via simple greedy reordering
    # Use 1 - agreement as distance
    dist = 100 - agree_matrix
    # Simple nearest-neighbor chain ordering: start at index 0 and repeatedly
    # append the closest not-yet-placed model.
    remaining = list(range(n))
    order = [remaining.pop(0)]
    while remaining:
        last = order[-1]
        nearest = min(remaining, key=lambda x: dist[last][x])
        remaining.remove(nearest)
        order.append(nearest)
    # Permute rows and columns into the chained order.
    reordered = agree_matrix[np.ix_(order, order)]
    reordered_labels = [models[i] for i in order]
    im = ax.imshow(reordered, cmap="YlGnBu", vmin=60, vmax=100, aspect="equal")
    ax.set_xticks(range(n))
    ax.set_xticklabels(reordered_labels, rotation=60, ha="right", fontsize=8)
    ax.set_yticks(range(n))
    ax.set_yticklabels(reordered_labels, fontsize=8)
    ax.set_title("Model Pairwise Category Agreement % (clustered)", fontweight="bold")
    # Annotate off-diagonal cells; white text on dark cells for contrast.
    for i in range(n):
        for j in range(n):
            if i != j:
                val = reordered[i][j]
                color = "white" if val < 75 else "black"
                ax.text(j, i, f"{val:.0f}", ha="center", va="center", fontsize=7, color=color)
    fig.colorbar(im, ax=ax, shrink=0.7, label="Agreement %")
    fig.tight_layout()
    fig.savefig(CHART_DIR / "28_model_clustering.png", dpi=150)
    plt.close(fig)
    print(" 28_model_clustering.png")
# ═══════════════════════════════════════════════════════════
# CHART 29: Specificity calibration — per-model spec distribution conditioned on Opus spec
# ═══════════════════════════════════════════════════════════
def plot_spec_calibration():
    """2x2 panel of specificity calibration matrices for selected models.

    Each panel is a row-normalized confusion matrix: rows are the Opus 4.6
    specificity label, columns the model's prediction; cells show the row
    percentage with the raw count in parentheses.
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    wanted = ["GPT-5.4", "Gemini Pro", "Kimi K2.5", "MIMO Flash"]
    shown = [m for m in wanted if m in ALL_GENAI]
    tick_labels = ["S1", "S2", "S3", "S4"]
    for ax, model in zip(axes.flat, shown):
        # Count (Opus spec, model spec) pairs over paragraphs both labeled.
        counts = np.zeros((4, 4))
        for c in consensus.values():
            sig = c["signals"]
            if "Opus 4.6" not in sig or model not in sig:
                continue
            counts[sig["Opus 4.6"]["spec"] - 1][sig[model]["spec"] - 1] += 1
        # Row-normalize to percentages; empty rows stay at 0.
        row_sums = counts.sum(axis=1, keepdims=True)
        pct = np.where(row_sums > 0, counts / row_sums * 100, 0)
        ax.imshow(pct, cmap="YlGnBu", aspect="equal", vmin=0, vmax=100)
        ax.set_xticks(range(4))
        ax.set_xticklabels(tick_labels)
        ax.set_yticks(range(4))
        ax.set_yticklabels(tick_labels)
        ax.set_xlabel(f"{model} prediction")
        ax.set_ylabel("Opus label")
        ax.set_title(f"{model} Specificity Calibration", fontweight="bold")
        for row in range(4):
            for col in range(4):
                share = pct[row][col]
                raw = int(counts[row][col])
                ax.text(col, row, f"{share:.0f}%\n({raw})", ha="center", va="center",
                        fontsize=8, color="white" if share > 60 else "black")
    fig.tight_layout()
    fig.savefig(CHART_DIR / "29_spec_calibration.png", dpi=150)
    plt.close(fig)
    print(" 29_spec_calibration.png")
# ═══════════════════════════════════════════════════════════
# CHART 30: Latency vs accuracy
# ═══════════════════════════════════════════════════════════
def plot_latency_vs_accuracy():
    """Scatter plot of mean per-call latency (s) vs category agreement with Opus.

    Latencies are pooled from three sources: benchmark runs (complete,
    non-error files only), the Stage 1 panel, and the Opus golden pass.
    Each remaining GenAI model is then plotted against its category-agreement
    rate with the Opus 4.6 reference, colored by pricing tier.
    """
    fig, ax = plt.subplots(figsize=(12, 7))
    model_lats: dict[str, list[float]] = defaultdict(list)
    # Benchmark latencies: skip error dumps and incomplete (<100 record) runs.
    for bf in bench_files:
        if "errors" in bf.name:
            continue
        records = load_jsonl(bf)
        if len(records) < 100:
            continue
        mid = records[0]["provenance"]["modelId"]
        short = MODEL_SHORT.get(mid, mid.split("/")[-1])
        for r in records:
            model_lats[short].append(r["provenance"].get("latencyMs", 0))
    # Stage 1 panel latencies (paragraph ids themselves are irrelevant here).
    for annots in stage1_by_pid.values():
        for a in annots:
            mid = a["provenance"]["modelId"]
            short = MODEL_SHORT.get(mid, mid.split("/")[-1])
            model_lats[short].append(a["provenance"].get("latencyMs", 0))
    # Opus golden-pass latencies.
    for r in opus_by_pid.values():
        model_lats["Opus 4.6"].append(r["provenance"].get("latencyMs", 0))
    # Invert the short-name map once; previously this dict was rebuilt on
    # every loop iteration below.
    short_to_full = {v: k for k, v in MODEL_SHORT.items()}
    for model in sorted(ALL_GENAI):
        if model == "Opus 4.6":
            # Self-agreement is meaningless; the old code plotted a degenerate
            # (latency, 0) point here that was clipped off-axis — skip instead.
            continue
        lats = model_lats.get(model, [])
        if not lats:
            continue
        avg_lat = sum(lats) / len(lats) / 1000  # ms -> seconds
        agree, total = 0, 0
        for c in consensus.values():
            sig = c["signals"]
            if model in sig and "Opus 4.6" in sig:
                total += 1
                if sig[model]["cat"] == sig["Opus 4.6"]["cat"]:
                    agree += 1
        cat_acc = agree / total * 100 if total > 0 else 0
        tier = MODEL_TIER.get(short_to_full.get(model, ""), "mid")
        color = TIER_COLORS.get(tier, "#999")
        ax.scatter(avg_lat, cat_acc, s=150, c=color, edgecolors="black", linewidths=0.5, zorder=3)
        ax.annotate(model, (avg_lat, cat_acc), textcoords="offset points", xytext=(8, 4), fontsize=7)
    ax.set_xlabel("Average Latency (seconds)")
    ax.set_ylabel("Category Agreement with Opus (%)")
    ax.set_title("Latency vs Category Accuracy", fontweight="bold")
    ax.set_ylim(60, 100)
    from matplotlib.patches import Patch
    legend_elements = [Patch(facecolor=c, label=t) for t, c in TIER_COLORS.items()]
    ax.legend(handles=legend_elements, loc="lower left")
    fig.tight_layout()
    fig.savefig(CHART_DIR / "30_latency_vs_accuracy.png", dpi=150)
    plt.close(fig)
    print(" 30_latency_vs_accuracy.png")
# ═══════════════════════════════════════════════════════════
# TEXTUAL ANALYSIS
# ═══════════════════════════════════════════════════════════
def print_full_analysis():
    """Print the full textual summary of the 13-signal analysis to stdout.

    Sections: signal coverage, adjudication tiers, cross-source agreement
    (category and specificity), per-model accuracy vs the Opus reference and
    vs the human majority, disagreement axes, three-way splits, unanimity
    rates, benchmark cost, and a narrative "key findings" section.

    NOTE(review): several percentages below divide counts by 12 or compare
    against a hard-coded 1200 — this assumes the holdout is exactly 1,200
    paragraphs; confirm if the gold set ever changes size.
    """
    print("\n" + "=" * 80)
    print("COMPREHENSIVE 13-SIGNAL ANALYSIS")
    print("=" * 80)
    # ── Summary stats ──
    print(f"\n{'' * 60}")
    print("SIGNAL COVERAGE")
    print(f"{'' * 60}")
    signal_counts = [c["n_signals"] for c in consensus.values()]
    print(f"  Paragraphs: {len(consensus)}")
    print(f"  Min/Max/Mean signals per paragraph: {min(signal_counts)}/{max(signal_counts)}/{np.mean(signal_counts):.1f}")
    print(f"  GenAI models: {len(ALL_GENAI)}")
    print(f"  Human annotators: {len(annotator_names)}")
    # ── Adjudication ──
    print(f"\n{'' * 60}")
    print("ADJUDICATION TIERS")
    print(f"{'' * 60}")
    # Tier sizes as a share of the (assumed 1,200-paragraph) holdout.
    for t in range(1, 5):
        pct = len(tiers[t]) / 1200 * 100
        print(f"  Tier {t}: {len(tiers[t]):4d}  ({pct:5.1f}%)")
    # What are the dominant categories in Tier 1 vs Tier 4?
    # Each paragraph is bucketed by its modal category across all 13 signals.
    for t in [1, 4]:
        cats = Counter()
        for pid in tiers[t]:
            cats[consensus[pid]["all_cat_counts"].most_common(1)[0][0]] += 1
        print(f"\n  Tier {t} category breakdown:")
        for cat, n in cats.most_common():
            print(f"    {CAT_MAP[cat]}: {n} ({n/len(tiers[t])*100:.1f}%)")
    # ── Cross-source agreement ──
    print(f"\n{'' * 60}")
    print("CROSS-SOURCE AGREEMENT — CATEGORY")
    print(f"{'' * 60}")
    # Pairwise "same category" counts between the source-level consensus
    # signals; each count only includes paragraphs where both sides exist.
    h_eq_s1 = sum(1 for c in consensus.values()
                  if c["human_cat_maj"] and c["s1_cat_maj"] and c["human_cat_maj"] == c["s1_cat_maj"])
    h_eq_op = sum(1 for c in consensus.values()
                  if c["human_cat_maj"] and c["opus_cat"] and c["human_cat_maj"] == c["opus_cat"])
    h_eq_g = sum(1 for c in consensus.values()
                 if c["human_cat_maj"] and c["genai_cat_maj"] and c["human_cat_maj"] == c["genai_cat_maj"])
    s1_eq_op = sum(1 for c in consensus.values()
                   if c["s1_cat_maj"] and c["opus_cat"] and c["s1_cat_maj"] == c["opus_cat"])
    g_eq_op = sum(1 for c in consensus.values()
                  if c["genai_cat_maj"] and c["opus_cat"] and c["genai_cat_maj"] == c["opus_cat"])
    # Denominators: paragraphs with a human majority / with an Opus label.
    # NOTE(review): the Human=S1 and Human=GenAI rows reuse n_hmaj as the
    # denominator even though the numerators also require the other side to
    # exist — slight underestimate if S1/GenAI majorities are ever missing.
    n_hmaj = sum(1 for c in consensus.values() if c["human_cat_maj"])
    n_opus = sum(1 for c in consensus.values() if c["opus_cat"])
    print(f"  Human maj = S1 maj:     {h_eq_s1}/{n_hmaj} ({h_eq_s1/n_hmaj*100:.1f}%)")
    print(f"  Human maj = Opus:       {h_eq_op}/{n_opus} ({h_eq_op/n_opus*100:.1f}%)")
    print(f"  Human maj = GenAI maj:  {h_eq_g}/{n_hmaj} ({h_eq_g/n_hmaj*100:.1f}%)")
    print(f"  S1 maj = Opus:          {s1_eq_op}/{n_opus} ({s1_eq_op/n_opus*100:.1f}%)")
    print(f"  GenAI maj = Opus:       {g_eq_op}/{n_opus} ({g_eq_op/n_opus*100:.1f}%)")
    # ── Cross-source agreement: specificity ──
    print(f"\n{'' * 60}")
    print("CROSS-SOURCE AGREEMENT — SPECIFICITY")
    print(f"{'' * 60}")
    # Specificity majorities use explicit "is not None" checks because a
    # valid spec level could in principle be falsy-adjacent (levels are 1-4).
    h_eq_s1_s = sum(1 for c in consensus.values()
                    if c["human_spec_maj"] is not None and c["s1_spec_maj"] is not None
                    and c["human_spec_maj"] == c["s1_spec_maj"])
    h_eq_op_s = sum(1 for c in consensus.values()
                    if c["human_spec_maj"] is not None and c["opus_spec"] is not None
                    and c["human_spec_maj"] == c["opus_spec"])
    h_eq_g_s = sum(1 for c in consensus.values()
                   if c["human_spec_maj"] is not None and c["genai_spec_maj"] is not None
                   and c["human_spec_maj"] == c["genai_spec_maj"])
    n_hs = sum(1 for c in consensus.values() if c["human_spec_maj"] is not None)
    print(f"  Human maj = S1 maj:     {h_eq_s1_s}/{n_hs} ({h_eq_s1_s/n_hs*100:.1f}%)")
    print(f"  Human maj = Opus:       {h_eq_op_s}/{n_hs} ({h_eq_op_s/n_hs*100:.1f}%)")
    print(f"  Human maj = GenAI maj:  {h_eq_g_s}/{n_hs} ({h_eq_g_s/n_hs*100:.1f}%)")
    # ── Per-model accuracy ──
    print(f"\n{'' * 60}")
    print("PER-MODEL ACCURACY vs OPUS (category / specificity / both)")
    print(f"{'' * 60}")
    model_stats = []
    for model in sorted(ALL_GENAI):
        if model == "Opus 4.6":
            continue  # skip trivial self-comparison
        agree_c, agree_s, agree_b, total = 0, 0, 0, 0
        for pid, c in consensus.items():
            sig = c["signals"]
            if model in sig and "Opus 4.6" in sig:
                total += 1
                cat_match = sig[model]["cat"] == sig["Opus 4.6"]["cat"]
                spec_match = sig[model]["spec"] == sig["Opus 4.6"]["spec"]
                if cat_match:
                    agree_c += 1
                if spec_match:
                    agree_s += 1
                if cat_match and spec_match:
                    agree_b += 1
        if total > 0:
            model_stats.append((model, agree_c / total * 100, agree_s / total * 100, agree_b / total * 100, total))
    model_stats.sort(key=lambda x: -x[3])  # sort by both
    for model, cat, spec, both, n in model_stats:
        print(f"  {model:20s} cat={cat:5.1f}%  spec={spec:5.1f}%  both={both:5.1f}%  (n={n})")
    # ── Per-model accuracy vs HUMAN majority ──
    print(f"\n{'' * 60}")
    print("PER-MODEL ACCURACY vs HUMAN MAJORITY (category / specificity / both)")
    print(f"{'' * 60}")
    # Same structure as above, but scored against the 3-annotator human
    # majority instead of the Opus reference (Opus itself is included here).
    model_stats_h = []
    for model in sorted(ALL_GENAI):
        agree_c, agree_s, agree_b, total = 0, 0, 0, 0
        for pid, c in consensus.items():
            sig = c["signals"]
            hm_c = c["human_cat_maj"]
            hm_s = c["human_spec_maj"]
            if model in sig and hm_c:
                total += 1
                cat_match = sig[model]["cat"] == hm_c
                spec_match = hm_s is not None and sig[model]["spec"] == hm_s
                if cat_match:
                    agree_c += 1
                if spec_match:
                    agree_s += 1
                if cat_match and spec_match:
                    agree_b += 1
        if total > 0:
            model_stats_h.append((model, agree_c / total * 100, agree_s / total * 100, agree_b / total * 100, total))
    model_stats_h.sort(key=lambda x: -x[3])
    for model, cat, spec, both, n in model_stats_h:
        print(f"  {model:20s} cat={cat:5.1f}%  spec={spec:5.1f}%  both={both:5.1f}%  (n={n})")
    # ── Disagreement patterns ──
    print(f"\n{'' * 60}")
    print("CROSS-SOURCE DISAGREEMENT AXES (Human Maj ≠ GenAI Maj)")
    print(f"{'' * 60}")
    # Unordered category pairs (sorted tuple) so A→B and B→A pool together.
    h_g_confusion = Counter()
    for c in consensus.values():
        hm = c["human_cat_maj"]
        gm = c["genai_cat_maj"]
        if hm and gm and hm != gm:
            h_g_confusion[tuple(sorted([hm, gm]))] += 1
    for (a, b), count in h_g_confusion.most_common(10):
        print(f"  {CAT_MAP[a]}{CAT_MAP[b]}: {count}")
    # ── 3-way splits ──
    print(f"\n{'' * 60}")
    print("THREE-WAY SPLITS (no majority)")
    print(f"{'' * 60}")
    no_human_maj = sum(1 for c in consensus.values() if c["human_cat_maj"] is None)
    no_s1_maj = sum(1 for c in consensus.values() if c["s1_cat_maj"] is None)
    no_genai_maj = sum(1 for c in consensus.values() if c["genai_cat_maj"] is None)
    print(f"  Human 3-way split:    {no_human_maj}")
    print(f"  Stage 1 3-way split:  {no_s1_maj}")
    print(f"  GenAI (10-model) no majority: {no_genai_maj}")
    # ── Unanimity rates ──
    print(f"\n{'' * 60}")
    print("UNANIMITY RATES")
    print(f"{'' * 60}")
    h_cat_u = sum(1 for c in consensus.values() if c["human_cat_unanimous"])
    h_spec_u = sum(1 for c in consensus.values() if c["human_spec_unanimous"])
    h_both_u = sum(1 for c in consensus.values() if c["human_cat_unanimous"] and c["human_spec_unanimous"])
    g_cat_u = sum(1 for c in consensus.values() if len(set(c["genai_cats"])) == 1)
    g_spec_u = sum(1 for c in consensus.values() if len(set(c["genai_specs"])) == 1)
    g_both_u = sum(1 for c in consensus.values() if len(set(c["genai_cats"])) == 1 and len(set(c["genai_specs"])) == 1)
    a_cat_u = sum(1 for c in consensus.values() if len(set(c["all_cats"])) == 1)
    a_both_u = sum(1 for c in consensus.values() if len(set(c["all_cats"])) == 1 and len(set(c["all_specs"])) == 1)
    # "/12" = count/1200*100, i.e. percent of the assumed 1,200 paragraphs.
    print(f"  Human (3):  cat={h_cat_u/12:.1f}%  spec={h_spec_u/12:.1f}%  both={h_both_u/12:.1f}%")
    print(f"  GenAI (10): cat={g_cat_u/12:.1f}%  spec={g_spec_u/12:.1f}%  both={g_both_u/12:.1f}%")
    print(f"  All (13):   cat={a_cat_u/12:.1f}%  both={a_both_u/12:.1f}%")
    # ── Cost summary ──
    print(f"\n{'' * 60}")
    print("COST SUMMARY (benchmark run)")
    print(f"{'' * 60}")
    # Sum per-record costUsd per benchmark file, skipping error dumps and
    # incomplete (<100 record) runs — same filter as the chart functions.
    total_cost = 0
    for bf in bench_files:
        if "errors" in bf.name:
            continue
        records = load_jsonl(bf)
        if len(records) < 100:
            continue
        mid = records[0]["provenance"]["modelId"]
        short = MODEL_SHORT.get(mid, mid.split("/")[-1])
        cost = sum(r["provenance"].get("costUsd", 0) for r in records)
        total_cost += cost
        print(f"  {short:20s}: ${cost:.2f}")
    print(f"  {'TOTAL':20s}: ${total_cost:.2f}")
    # ── Key findings ──
    print(f"\n{'=' * 80}")
    print("KEY FINDINGS")
    print(f"{'=' * 80}")
    print(f"""
 1. ADJUDICATION: {len(tiers[1])}/{1200} paragraphs ({len(tiers[1])/12:.1f}%) fall into Tier 1 (10+/13 agree),
    requiring zero human intervention. Tier 2 adds {len(tiers[2])} more with cross-validated consensus.
    Only {len(tiers[3]) + len(tiers[4])} ({(len(tiers[3]) + len(tiers[4]))/12:.1f}%) need expert adjudication.
 2. OPUS AS REFERENCE: GenAI majority agrees with Opus on {g_eq_op/n_opus*100:.1f}% of categories.
    Human majority agrees with Opus on {h_eq_op/n_opus*100:.1f}%.
    Human majority agrees with GenAI majority on {h_eq_g/n_hmaj*100:.1f}%.
 3. SPECIFICITY REMAINS HARD: Human spec unanimity is only {h_spec_u/12:.1f}%, GenAI spec unanimity
    is {g_spec_u/12:.1f}%. The Spec 3↔4 boundary is the dominant axis of disagreement for everyone.
 4. AARYAN EFFECT: Excluding the outlier annotator would push category alpha from 0.801 to ~0.87+,
    and specificity alpha from 0.546 to ~0.65+. His paragraphs show a ~+45pp jump
    in both-unanimous rate when he's excluded.
 5. SAME CONFUSION AXES: MR↔RMP > BG↔MR > N/O↔SI for humans, Stage 1, AND full GenAI panel.
    The codebook boundaries, not the annotator type, drive disagreement.
""")
# ═══════════════════════════════════════════════════════════
# RUN ALL
# ═══════════════════════════════════════════════════════════
print("\nGenerating charts...")
plot_kappa_heatmaps()
plot_all_source_category_dist()
plot_all_source_spec_dist()
plot_human_confusion()
plot_genai_agreement_matrix()
plot_cross_source_confusion()
plot_cross_source_specificity()
plot_adjudication_tiers()
plot_model_accuracy_vs_opus()
plot_cost_vs_accuracy()
plot_per_category_accuracy()
plot_ensemble_accuracy()
plot_agreement_by_wordcount()
plot_time_vs_agreement()
plot_outlier_annotator()
plot_with_without_outlier()
plot_disagreement_axes()
plot_none_other_analysis()
plot_specificity_bias_all()
plot_quiz_vs_quality()
plot_human_vs_genai_consensus()
plot_signal_agreement_dist()
plot_annotator_vs_references()
plot_hard_paragraphs()
plot_model_vs_human_per_category()
plot_prompt_version_effect()
plot_conditional_agreement()
plot_model_clustering()
plot_spec_calibration()
plot_latency_vs_accuracy()
print_full_analysis()
print(f"\nAll charts saved to {CHART_DIR}/")