2199 lines
96 KiB
Python
2199 lines
96 KiB
Python
"""
|
||
Comprehensive 13-signal analysis of gold set holdout.
|
||
|
||
Sources (per paragraph):
|
||
3 human annotators (BIBD)
|
||
3 Stage 1 panel (gemini-flash-lite, mimo-v2-flash, grok-fast) — v2.5
|
||
1 Opus 4.6 golden — v3.0+codebook
|
||
6 benchmark models (gpt-5.4, kimi-k2.5, gemini-pro, glm-5, minimax-m2.7, mimo-pro) — v3.0
|
||
|
||
Outputs ~30 charts to data/gold/charts/ and detailed textual analysis to stdout.
|
||
"""
|
||
|
||
import json
|
||
import os
|
||
from collections import Counter, defaultdict
|
||
from itertools import combinations
|
||
from pathlib import Path
|
||
|
||
import matplotlib
|
||
matplotlib.use("Agg")
|
||
import matplotlib.pyplot as plt
|
||
import matplotlib.ticker as mticker
|
||
import numpy as np
|
||
|
||
# ── Paths ──
# Absolute project root; all inputs/outputs are resolved relative to it.
ROOT = Path("/home/joey/Documents/sec-cyBERT")
GOLD_DIR = ROOT / "data/gold"
CHART_DIR = GOLD_DIR / "charts"          # chart PNGs are written here
STAGE1_PATH = ROOT / "data/annotations/stage1.patched.jsonl"
OPUS_PATH = ROOT / "data/annotations/golden/opus.jsonl"
BENCH_DIR = ROOT / "data/annotations/bench-holdout"
LABELS_PATH = GOLD_DIR / "human-labels-raw.jsonl"
METRICS_PATH = GOLD_DIR / "metrics.json"

# Closed label set used by every source; order fixes matrix/chart axes.
CATEGORIES = [
    "Board Governance", "Management Role", "Risk Management Process",
    "Third-Party Risk", "Incident Disclosure", "Strategy Integration", "None/Other",
]
CAT_SHORT = ["BG", "MR", "RMP", "TPR", "ID", "SI", "N/O"]  # tick labels
CAT_MAP = dict(zip(CATEGORIES, CAT_SHORT))           # full name -> short code
CAT_IDX = {c: i for i, c in enumerate(CATEGORIES)}   # full name -> axis index
SPEC_LEVELS = [1, 2, 3, 4]  # specificity levels (1 generic .. 4 quantified)

CHART_DIR.mkdir(parents=True, exist_ok=True)

# ── Shared style ──
plt.rcParams.update({
    "figure.facecolor": "white",
    "axes.facecolor": "#fafafa",
    "axes.grid": True,
    "grid.alpha": 0.3,
    "font.size": 10,
})

# Short display names for models (full provider id -> chart label)
MODEL_SHORT = {
    "google/gemini-3.1-flash-lite-preview": "Gemini Lite",
    "x-ai/grok-4.1-fast": "Grok Fast",
    "xiaomi/mimo-v2-flash": "MIMO Flash",
    "anthropic/claude-opus-4-6": "Opus 4.6",
    "openai/gpt-5.4": "GPT-5.4",
    "moonshotai/kimi-k2.5": "Kimi K2.5",
    "google/gemini-3.1-pro-preview": "Gemini Pro",
    "z-ai/glm-5": "GLM-5",
    "minimax/minimax-m2.7": "MiniMax M2.7",
    "xiaomi/mimo-v2-pro": "MIMO Pro",
}

# Cost/quality tier per model id; drives scatter/bar colors below.
MODEL_TIER = {
    "google/gemini-3.1-flash-lite-preview": "stage1",
    "x-ai/grok-4.1-fast": "stage1",
    "xiaomi/mimo-v2-flash": "stage1",
    "anthropic/claude-opus-4-6": "frontier",
    "openai/gpt-5.4": "frontier",
    "moonshotai/kimi-k2.5": "frontier",
    "google/gemini-3.1-pro-preview": "frontier",
    "z-ai/glm-5": "mid",
    "minimax/minimax-m2.7": "budget",
    "xiaomi/mimo-v2-pro": "mid",
}

TIER_COLORS = {
    "stage1": "#95a5a6",
    "frontier": "#e74c3c",
    "mid": "#f39c12",
    "budget": "#27ae60",
}
|
||
|
||
|
||
def load_jsonl(path: Path) -> list[dict]:
    """Read a JSONL file and return one dict per non-blank line."""
    with open(path) as fh:
        return [json.loads(raw) for raw in fh if raw.strip()]
|
||
|
||
|
||
def majority_vote(items: list) -> object | None:
|
||
if not items:
|
||
return None
|
||
c = Counter(items)
|
||
top, count = c.most_common(1)[0]
|
||
return top if count > len(items) / 2 else None
|
||
|
||
|
||
def plurality_vote(items: list) -> tuple:
    """Return ``(top_item, count)`` for the most frequent element of *items*.

    Unlike :func:`majority_vote`, no strict majority is required and ties are
    broken arbitrarily (Counter insertion order).  Fix: an empty input now
    returns ``(None, 0)`` instead of raising IndexError, matching the
    None-on-empty convention of :func:`majority_vote`.
    """
    if not items:
        return (None, 0)
    return Counter(items).most_common(1)[0]
|
||
|
||
|
||
def cohens_kappa(labels_a: list, labels_b: list) -> float:
    """Compute Cohen's kappa for two equal-length lists of categorical labels.

    kappa = (p_o - p_e) / (1 - p_e), where p_o is the observed agreement rate
    and p_e the agreement expected from the two raters' independent marginals.

    Returns 0.0 for empty input, and 1.0 in the degenerate case p_e >= 1
    (both raters use a single label), which would otherwise divide by zero.

    Raises:
        ValueError: if the two lists differ in length.  (Fix: the original
        used ``assert``, which is silently stripped under ``python -O``.)
    """
    if len(labels_a) != len(labels_b):
        raise ValueError(
            f"label lists must have equal length ({len(labels_a)} != {len(labels_b)})"
        )
    n = len(labels_a)
    if n == 0:
        return 0.0
    all_labels = sorted(set(labels_a) | set(labels_b))
    idx = {lab: i for i, lab in enumerate(all_labels)}
    k = len(all_labels)
    conf = np.zeros((k, k))
    for a, b in zip(labels_a, labels_b):
        conf[idx[a], idx[b]] += 1
    po = np.trace(conf) / n  # observed agreement (diagonal mass)
    # Expected agreement under independence of the row/column marginals.
    pe = sum((conf[i, :].sum() / n) * (conf[:, i].sum() / n) for i in range(k))
    if pe >= 1.0:
        return 1.0
    return (po - pe) / (1 - pe)
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
# LOAD ALL DATA
# ═══════════════════════════════════════════════════════════
print("Loading data...")

# Human labels define the holdout: any paragraphId appearing here is in scope,
# and every other source is filtered down to this id set.
human_labels = load_jsonl(LABELS_PATH)
holdout_ids = {l["paragraphId"] for l in human_labels}
print(f" {len(human_labels)} human labels, {len(holdout_ids)} paragraphs")

# Stage 1 annotations for holdout (streamed: the file covers far more than
# the holdout, so filter while reading instead of loading it all).
stage1_by_pid: dict[str, list[dict]] = defaultdict(list)
with open(STAGE1_PATH) as f:
    for line in f:
        d = json.loads(line)
        if d["paragraphId"] in holdout_ids:
            stage1_by_pid[d["paragraphId"]].append(d)
print(f" {sum(len(v) for v in stage1_by_pid.values())} Stage 1 annotations")

# Opus golden annotations, keyed by paragraph id (one per paragraph).
opus_by_pid: dict[str, dict] = {}
for r in load_jsonl(OPUS_PATH):
    if r["paragraphId"] in holdout_ids:
        opus_by_pid[r["paragraphId"]] = r
print(f" {len(opus_by_pid)} Opus annotations matched to holdout")

# Benchmark models: one JSONL per model run; the model id is taken from the
# first record's provenance, then mapped to its short display name.
bench_by_model: dict[str, dict[str, dict]] = {}  # model_short -> {pid -> annotation}
bench_files = sorted(BENCH_DIR.glob("*.jsonl"))
for bf in bench_files:
    if "errors" in bf.name:
        continue
    records = load_jsonl(bf)
    if len(records) < 100:
        continue  # skip partial runs (deepseek-r1 has 1 annotation)
    model_id = records[0]["provenance"]["modelId"]
    short = MODEL_SHORT.get(model_id, model_id.split("/")[-1])
    by_pid = {}
    for r in records:
        if r["paragraphId"] in holdout_ids:
            by_pid[r["paragraphId"]] = r
    bench_by_model[short] = by_pid
    print(f" {short}: {len(by_pid)} annotations")

# Human labels grouped by paragraph (3 annotators per paragraph via BIBD).
human_by_pid: dict[str, list[dict]] = defaultdict(list)
for l in human_labels:
    human_by_pid[l["paragraphId"]].append(l)

annotator_names = sorted({l["annotatorName"] for l in human_labels})
# Precomputed inter-annotator metrics (pairwise kappa matrices) from disk.
metrics = json.loads(METRICS_PATH.read_text())

# Paragraph metadata (e.g. wordCount), restricted to the holdout ids.
para_all = load_jsonl(GOLD_DIR / "paragraphs-holdout.jsonl")
para_meta = {p["id"]: p for p in para_all if p["id"] in holdout_ids}
|
||
|
||
# ═══════════════════════════════════════════════════════════
# BUILD 13-SIGNAL MATRIX
# ═══════════════════════════════════════════════════════════
print("\nBuilding 13-signal matrix...")

# For each paragraph, collect all signals
# GenAI models: 3 Stage1 + Opus + 6 bench = 10
GENAI_SOURCES = ["Gemini Lite", "Grok Fast", "MIMO Flash", "Opus 4.6"] + sorted(bench_by_model.keys())
# Deduplicate (Opus might already be in bench)
GENAI_SOURCES = list(dict.fromkeys(GENAI_SOURCES))
ALL_GENAI = GENAI_SOURCES

# Model ID to short name mapping (reverse)
# NOTE(review): despite the name this maps short display name -> full model id
# (it inverts MODEL_SHORT); downstream code uses it in that direction.
MODEL_ID_TO_SHORT = {v: k for k, v in MODEL_SHORT.items()}

# pid -> {source_name: {"cat": ..., "spec": ...}}.  Human sources are keyed
# "H:<annotator>"; GenAI sources by their short model name.
signals = {}
for pid in holdout_ids:
    sig = {}

    # Human labels (camelCase field names in the raw label records).
    for lbl in human_by_pid.get(pid, []):
        sig[f"H:{lbl['annotatorName']}"] = {
            "cat": lbl["contentCategory"],
            "spec": lbl["specificityLevel"],
        }

    # Stage 1 panel (snake_case field names under "label").
    for a in stage1_by_pid.get(pid, []):
        mid = a["provenance"]["modelId"]
        short = MODEL_SHORT.get(mid, mid.split("/")[-1])
        sig[short] = {"cat": a["label"]["content_category"], "spec": a["label"]["specificity_level"]}

    # Opus golden annotation, if present for this paragraph.
    if pid in opus_by_pid:
        sig["Opus 4.6"] = {
            "cat": opus_by_pid[pid]["label"]["content_category"],
            "spec": opus_by_pid[pid]["label"]["specificity_level"],
        }

    # Benchmark models (may overwrite a same-named Stage 1/Opus entry,
    # keeping one signal per source name).
    for model_short, by_pid_map in bench_by_model.items():
        if pid in by_pid_map:
            a = by_pid_map[pid]
            sig[model_short] = {"cat": a["label"]["content_category"], "spec": a["label"]["specificity_level"]}

    signals[pid] = sig

# Derive consensus labels per paragraph: human-only, Stage-1-only, all-GenAI,
# and pooled views, plus majority votes on each.
consensus = {}
for pid in holdout_ids:
    sig = signals[pid]

    human_cats = [s["cat"] for k, s in sig.items() if k.startswith("H:")]
    human_specs = [s["spec"] for k, s in sig.items() if k.startswith("H:")]
    genai_cats = [s["cat"] for k, s in sig.items() if not k.startswith("H:")]
    genai_specs = [s["spec"] for k, s in sig.items() if not k.startswith("H:")]
    all_cats = human_cats + genai_cats
    all_specs = human_specs + genai_specs

    s1_cats = [s["cat"] for k, s in sig.items() if k in ("Gemini Lite", "Grok Fast", "MIMO Flash")]
    s1_specs = [s["spec"] for k, s in sig.items() if k in ("Gemini Lite", "Grok Fast", "MIMO Flash")]

    # Specificity votes are cast as strings and converted back to int just
    # below; categories vote on their string names directly.
    consensus[pid] = {
        "human_cats": human_cats,
        "human_specs": human_specs,
        "human_cat_maj": majority_vote(human_cats),
        "human_spec_maj": majority_vote([str(s) for s in human_specs]),
        "human_cat_unanimous": len(set(human_cats)) == 1,
        "human_spec_unanimous": len(set(human_specs)) == 1,
        "s1_cats": s1_cats,
        "s1_specs": s1_specs,
        "s1_cat_maj": majority_vote(s1_cats),
        "s1_spec_maj": majority_vote([str(s) for s in s1_specs]),
        "genai_cats": genai_cats,
        "genai_specs": genai_specs,
        "genai_cat_maj": majority_vote(genai_cats),
        "genai_spec_maj": majority_vote([str(s) for s in genai_specs]),
        "all_cats": all_cats,
        "all_specs": all_specs,
        "all_cat_counts": Counter(all_cats),
        "all_spec_counts": Counter(all_specs),
        "n_signals": len(all_cats),
        "opus_cat": sig.get("Opus 4.6", {}).get("cat"),
        "opus_spec": sig.get("Opus 4.6", {}).get("spec"),
        "word_count": para_meta.get(pid, {}).get("wordCount", 0),
        "signals": sig,
    }
    # Fix human_spec_maj back to int (levels are 1-4, so "0" is impossible
    # and the truthiness check only filters None).
    hsm = consensus[pid]["human_spec_maj"]
    consensus[pid]["human_spec_maj"] = int(hsm) if hsm else None
    ssm = consensus[pid]["s1_spec_maj"]
    consensus[pid]["s1_spec_maj"] = int(ssm) if ssm else None
    gsm = consensus[pid]["genai_spec_maj"]
    consensus[pid]["genai_spec_maj"] = int(gsm) if gsm else None
|
||
|
||
# ═══════════════════════════════════════════════════════════
# ADJUDICATION TIERS
# ═══════════════════════════════════════════════════════════
# Bucket every paragraph into one of four adjudication tiers by how strongly
# the 13 signals (3 human + 10 GenAI) converge.  First matching tier wins.
tiers = {1: [], 2: [], 3: [], 4: []}
for pid, c in consensus.items():
    top_cat, top_cat_n = c["all_cat_counts"].most_common(1)[0]
    top_spec_n = Counter(c["all_specs"]).most_common(1)[0][1]

    hm_cat = c["human_cat_maj"]
    gm_cat = c["genai_cat_maj"]

    # Tier 1: 10+/13 agree on BOTH dimensions
    if top_cat_n >= 10 and top_spec_n >= 10:
        tiers[1].append(pid)
    # Tier 2: human majority + genai majority agree on category
    elif hm_cat and gm_cat and hm_cat == gm_cat:
        tiers[2].append(pid)
    # Tier 3: humans split, genai converges
    elif hm_cat is None and gm_cat:
        tiers[3].append(pid)
    # Tier 4: everything else
    else:
        tiers[4].append(pid)

# Fix: percentage was computed as len/12, silently hard-coding a 1,200
# paragraph holdout; derive the denominator from the data instead.
_n_paragraphs = len(consensus) or 1
print("\nAdjudication tiers:")
for t in range(1, 5):
    print(f" Tier {t}: {len(tiers[t])} paragraphs ({len(tiers[t]) / _n_paragraphs * 100:.1f}%)")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
# CHART 01: Pairwise Kappa Heatmaps (human annotators)
# ═══════════════════════════════════════════════════════════
def plot_kappa_heatmaps():
    """Chart 01: heatmaps of pairwise Cohen's κ between human annotators.

    Left panel is category agreement, right is specificity.  Reads the
    precomputed matrices from ``metrics["pairwiseKappa"]`` and writes
    ``01_kappa_heatmaps.png`` to CHART_DIR.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5.5))
    for ax, dim_key, title in [
        (ax1, "category", "Category"),
        (ax2, "specificity", "Specificity"),
    ]:
        data = metrics["pairwiseKappa"][dim_key]
        names = data["annotators"]
        matrix = np.array(data["matrix"])
        # Blank the diagonal — self-agreement is trivially 1 and would
        # dominate the color scale.
        mask = np.eye(len(names), dtype=bool)
        display = np.where(mask, np.nan, matrix)
        im = ax.imshow(display, cmap="RdYlGn", vmin=0, vmax=1, aspect="equal")
        ax.set_xticks(range(len(names)))
        ax.set_xticklabels(names, rotation=45, ha="right", fontsize=9)
        ax.set_yticks(range(len(names)))
        ax.set_yticklabels(names, fontsize=9)
        ax.set_title(f"Pairwise Cohen's κ — {title}", fontsize=12, fontweight="bold")
        # Annotate off-diagonal cells; white text where the cell is dark.
        for i in range(len(names)):
            for j in range(len(names)):
                if i != j:
                    color = "white" if matrix[i][j] < 0.4 else "black"
                    ax.text(j, i, f"{matrix[i][j]:.2f}", ha="center", va="center",
                            fontsize=8, color=color)
    # One shared colorbar for both panels (uses the last imshow handle).
    fig.colorbar(im, ax=[ax1, ax2], shrink=0.8, label="Cohen's κ")
    fig.tight_layout()
    fig.savefig(CHART_DIR / "01_kappa_heatmaps.png", dpi=150)
    plt.close(fig)
    print(" 01_kappa_heatmaps.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
# CHART 02: Per-source category distribution (all 13 sources)
# ═══════════════════════════════════════════════════════════
def plot_all_source_category_dist():
    """Chart 02: grouped bars of category share per source.

    Sources on x: each human annotator, the human/Stage-1 majority votes,
    and every GenAI model.  Bars show each category as a percentage of that
    source's labels.  Writes ``02_category_distribution_all.png``.
    """
    fig, ax = plt.subplots(figsize=(18, 7))

    sources = annotator_names + ["Human Maj", "S1 Maj"] + sorted(ALL_GENAI)
    dist = {s: Counter() for s in sources}

    # Raw human labels, counted per annotator.
    for l in human_labels:
        dist[l["annotatorName"]][l["contentCategory"]] += 1
    # Majority-vote pseudo-sources (only paragraphs with a strict majority).
    for c in consensus.values():
        if c["human_cat_maj"]:
            dist["Human Maj"][c["human_cat_maj"]] += 1
        if c["s1_cat_maj"]:
            dist["S1 Maj"][c["s1_cat_maj"]] += 1
    # All GenAI signals (non-"H:" entries of each paragraph's signal dict).
    for pid, c in consensus.items():
        for src, sig in c["signals"].items():
            if not src.startswith("H:"):
                dist[src][sig["cat"]] += 1

    x = np.arange(len(sources))
    width = 0.11
    # Center the 7 category bars around each source's x position.
    offsets = np.arange(len(CATEGORIES)) - len(CATEGORIES) / 2 + 0.5
    colors = plt.cm.Set2(np.linspace(0, 1, len(CATEGORIES)))

    for i, (cat, color) in enumerate(zip(CATEGORIES, colors)):
        counts = [dist[s].get(cat, 0) for s in sources]
        totals = [sum(dist[s].values()) or 1 for s in sources]  # avoid /0
        pcts = [c / t * 100 for c, t in zip(counts, totals)]
        ax.bar(x + offsets[i] * width, pcts, width, label=CAT_MAP[cat], color=color)

    ax.set_xticks(x)
    ax.set_xticklabels(sources, rotation=60, ha="right", fontsize=8)
    ax.set_ylabel("% of labels")
    ax.set_title("Category Distribution — All Sources (Humans + 10 GenAI Models)", fontweight="bold")
    ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left", fontsize=8)
    ax.yaxis.set_major_formatter(mticker.PercentFormatter())
    fig.tight_layout()
    fig.savefig(CHART_DIR / "02_category_distribution_all.png", dpi=150)
    plt.close(fig)
    print(" 02_category_distribution_all.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
# CHART 03: Per-source specificity distribution
# ═══════════════════════════════════════════════════════════
def plot_all_source_spec_dist():
    """Chart 03: grouped bars of specificity-level share per source.

    Same source layout as chart 02, but with the four specificity levels
    instead of categories.  Writes ``03_specificity_distribution_all.png``.
    """
    fig, ax = plt.subplots(figsize=(18, 6))
    sources = annotator_names + ["Human Maj", "S1 Maj"] + sorted(ALL_GENAI)
    dist = {s: Counter() for s in sources}

    # Raw human labels per annotator.
    for l in human_labels:
        dist[l["annotatorName"]][l["specificityLevel"]] += 1
    # Majority-vote pseudo-sources (None means no strict majority — skipped).
    for c in consensus.values():
        hm = c["human_spec_maj"]
        if hm is not None:
            dist["Human Maj"][hm] += 1
        sm = c["s1_spec_maj"]
        if sm is not None:
            dist["S1 Maj"][sm] += 1
    # All GenAI signals.
    for pid, c in consensus.items():
        for src, sig in c["signals"].items():
            if not src.startswith("H:"):
                dist[src][sig["spec"]] += 1

    x = np.arange(len(sources))
    width = 0.18
    colors = ["#e74c3c", "#f39c12", "#2ecc71", "#3498db"]
    spec_labels = ["1 Generic", "2 Sector", "3 Firm-Specific", "4 Quantified"]

    for i, (level, color, label) in enumerate(zip(SPEC_LEVELS, colors, spec_labels)):
        counts = [dist[s].get(level, 0) for s in sources]
        totals = [sum(dist[s].values()) or 1 for s in sources]  # avoid /0
        pcts = [c / t * 100 for c, t in zip(counts, totals)]
        # (i - 1.5) centers the four level bars around each x position.
        ax.bar(x + (i - 1.5) * width, pcts, width, label=label, color=color)

    ax.set_xticks(x)
    ax.set_xticklabels(sources, rotation=60, ha="right", fontsize=8)
    ax.set_ylabel("% of labels")
    ax.set_title("Specificity Distribution — All Sources", fontweight="bold")
    ax.legend()
    ax.yaxis.set_major_formatter(mticker.PercentFormatter())
    fig.tight_layout()
    fig.savefig(CHART_DIR / "03_specificity_distribution_all.png", dpi=150)
    plt.close(fig)
    print(" 03_specificity_distribution_all.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
# CHART 04: Human confusion matrices (category + specificity)
# ═══════════════════════════════════════════════════════════
def plot_human_confusion():
    """Chart 04: symmetric human-vs-human confusion matrices.

    For every paragraph, every unordered pair of its human labels is counted
    in both (a,b) and (b,a) cells, so the matrices are symmetric; rows are
    then normalized to percentages.  Left panel: category; right panel:
    specificity.  Writes ``04_human_confusion.png``.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    cat_conf = np.zeros((len(CATEGORIES), len(CATEGORIES)))
    for pid, lbls in human_by_pid.items():
        cats = [l["contentCategory"] for l in lbls]
        # Count each annotator pair once per direction (symmetric fill).
        for i in range(len(cats)):
            for j in range(i + 1, len(cats)):
                a, b = CAT_IDX[cats[i]], CAT_IDX[cats[j]]
                cat_conf[a][b] += 1
                cat_conf[b][a] += 1
    row_sums = cat_conf.sum(axis=1, keepdims=True)
    cat_conf_norm = np.where(row_sums > 0, cat_conf / row_sums * 100, 0)
    im1 = ax1.imshow(cat_conf_norm, cmap="YlOrRd", aspect="equal")
    ax1.set_xticks(range(len(CAT_SHORT)))
    ax1.set_xticklabels(CAT_SHORT, fontsize=9)
    ax1.set_yticks(range(len(CAT_SHORT)))
    ax1.set_yticklabels(CAT_SHORT, fontsize=9)
    ax1.set_title("Human Category Confusion (row-norm %)", fontweight="bold")
    for i in range(len(CAT_SHORT)):
        for j in range(len(CAT_SHORT)):
            val = cat_conf_norm[i][j]
            if val > 0.5:  # skip near-zero cells to reduce clutter
                color = "white" if val > 40 else "black"
                ax1.text(j, i, f"{val:.0f}", ha="center", va="center", fontsize=7, color=color)

    # Same construction for specificity (levels 1-4 map to indices 0-3).
    spec_conf = np.zeros((4, 4))
    for pid, lbls in human_by_pid.items():
        specs = [l["specificityLevel"] for l in lbls]
        for i in range(len(specs)):
            for j in range(i + 1, len(specs)):
                a, b = specs[i] - 1, specs[j] - 1
                spec_conf[a][b] += 1
                spec_conf[b][a] += 1
    row_sums = spec_conf.sum(axis=1, keepdims=True)
    spec_conf_norm = np.where(row_sums > 0, spec_conf / row_sums * 100, 0)
    im2 = ax2.imshow(spec_conf_norm, cmap="YlOrRd", aspect="equal")
    ax2.set_xticks(range(4))
    ax2.set_xticklabels(["S1", "S2", "S3", "S4"], fontsize=9)
    ax2.set_yticks(range(4))
    ax2.set_yticklabels(["S1", "S2", "S3", "S4"], fontsize=9)
    ax2.set_title("Human Specificity Confusion (row-norm %)", fontweight="bold")
    for i in range(4):
        for j in range(4):
            val = spec_conf_norm[i][j]
            if val > 0.5:
                color = "white" if val > 40 else "black"
                ax2.text(j, i, f"{val:.0f}", ha="center", va="center", fontsize=9, color=color)
    fig.colorbar(im1, ax=ax1, shrink=0.8)
    fig.colorbar(im2, ax=ax2, shrink=0.8)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "04_human_confusion.png", dpi=150)
    plt.close(fig)
    print(" 04_human_confusion.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
# CHART 05: GenAI Model Agreement Matrix (10×10 pairwise kappa)
# ═══════════════════════════════════════════════════════════
def plot_genai_agreement_matrix():
    """Chart 05: pairwise Cohen's κ between all GenAI models.

    Left panel: category; right: specificity.  κ is computed only over
    paragraphs where both models produced a signal, and only when that
    overlap has at least 100 paragraphs (smaller overlaps keep the
    initialized 0 off-diagonal).  Writes ``05_genai_agreement_matrix.png``.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))

    models = sorted(ALL_GENAI)
    n = len(models)

    for ax, dim, title in [(ax1, "cat", "Category"), (ax2, "spec", "Specificity")]:
        matrix = np.eye(n)  # diagonal 1 (self-agreement); filled symmetrically
        for i, m1 in enumerate(models):
            for j, m2 in enumerate(models):
                if i >= j:
                    continue  # upper triangle only; mirrored below
                labels_a, labels_b = [], []
                for pid, c in consensus.items():
                    sig = c["signals"]
                    if m1 in sig and m2 in sig:
                        # str() so int specificity and str category share a path
                        labels_a.append(str(sig[m1][dim]))
                        labels_b.append(str(sig[m2][dim]))
                if len(labels_a) >= 100:  # require a meaningful overlap
                    k = cohens_kappa(labels_a, labels_b)
                    matrix[i][j] = k
                    matrix[j][i] = k

        # Hide the trivial diagonal from the color map.
        mask = np.eye(n, dtype=bool)
        display = np.where(mask, np.nan, matrix)
        im = ax.imshow(display, cmap="RdYlGn", vmin=0.2, vmax=1, aspect="equal")
        ax.set_xticks(range(n))
        ax.set_xticklabels(models, rotation=60, ha="right", fontsize=7)
        ax.set_yticks(range(n))
        ax.set_yticklabels(models, fontsize=7)
        ax.set_title(f"GenAI Pairwise κ — {title}", fontweight="bold")
        for i in range(n):
            for j in range(n):
                if i != j:
                    val = matrix[i][j]
                    color = "white" if val < 0.5 else "black"
                    ax.text(j, i, f"{val:.2f}", ha="center", va="center", fontsize=6, color=color)

    fig.colorbar(im, ax=[ax1, ax2], shrink=0.7, label="Cohen's κ")
    fig.tight_layout()
    fig.savefig(CHART_DIR / "05_genai_agreement_matrix.png", dpi=150)
    plt.close(fig)
    print(" 05_genai_agreement_matrix.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
# CHART 06: Cross-source confusion (Human vs Stage1, Human vs Opus, Human vs GenAI consensus)
# ═══════════════════════════════════════════════════════════
def plot_cross_source_confusion():
    """Chart 06: category confusion of human majority vs three references.

    One row-normalized confusion matrix per comparison (Stage-1 majority,
    Opus, GenAI majority); a paragraph contributes only when both sides have
    a non-None label.  Writes ``06_cross_source_category.png``.
    """
    # (display name A, display name B, consensus key A, consensus key B)
    comparisons = [
        ("Human Maj", "S1 Maj", "human_cat_maj", "s1_cat_maj"),
        ("Human Maj", "Opus 4.6", "human_cat_maj", "opus_cat"),
        ("Human Maj", "GenAI Maj", "human_cat_maj", "genai_cat_maj"),
    ]
    fig, axes = plt.subplots(1, 3, figsize=(21, 5.5))

    for ax, (name_a, name_b, key_a, key_b) in zip(axes, comparisons):
        conf = np.zeros((len(CATEGORIES), len(CATEGORIES)))
        total, agree = 0, 0
        for pid, c in consensus.items():
            a_val = c[key_a]
            b_val = c[key_b]
            if a_val and b_val:  # both sides must have a majority/label
                conf[CAT_IDX[a_val]][CAT_IDX[b_val]] += 1
                total += 1
                if a_val == b_val:
                    agree += 1
        row_sums = conf.sum(axis=1, keepdims=True)
        conf_norm = np.where(row_sums > 0, conf / row_sums * 100, 0)
        im = ax.imshow(conf_norm, cmap="YlGnBu", aspect="equal")
        ax.set_xticks(range(len(CAT_SHORT)))
        ax.set_xticklabels(CAT_SHORT, fontsize=8)
        ax.set_yticks(range(len(CAT_SHORT)))
        ax.set_yticklabels(CAT_SHORT, fontsize=8)
        pct = agree / total * 100 if total > 0 else 0
        ax.set_title(f"{name_a} vs {name_b}\n({pct:.1f}% agree, n={total})",
                     fontweight="bold", fontsize=10)
        ax.set_ylabel(name_a)
        ax.set_xlabel(name_b)
        for i in range(len(CAT_SHORT)):
            for j in range(len(CAT_SHORT)):
                val = conf_norm[i][j]
                if val > 0.5:  # skip near-zero cells
                    color = "white" if val > 50 else "black"
                    ax.text(j, i, f"{val:.0f}", ha="center", va="center", fontsize=7, color=color)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "06_cross_source_category.png", dpi=150)
    plt.close(fig)
    print(" 06_cross_source_category.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
# CHART 07: Cross-source specificity confusion
# ═══════════════════════════════════════════════════════════
def plot_cross_source_specificity():
    """Chart 07: specificity confusion of human majority vs three references.

    Mirrors chart 06 but over the four specificity levels (1-4 mapped to
    matrix indices 0-3).  Writes ``07_cross_source_specificity.png``.
    """
    comparisons = [
        ("Human Maj", "S1 Maj", "human_spec_maj", "s1_spec_maj"),
        ("Human Maj", "Opus", "human_spec_maj", "opus_spec"),
        ("Human Maj", "GenAI Maj", "human_spec_maj", "genai_spec_maj"),
    ]
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    for ax, (name_a, name_b, key_a, key_b) in zip(axes, comparisons):
        conf = np.zeros((4, 4))
        total, agree = 0, 0
        for pid, c in consensus.items():
            a_val = c[key_a]
            b_val = c[key_b]
            # Explicit None checks: level values are never 0 but keys may be None.
            if a_val is not None and b_val is not None:
                conf[int(a_val) - 1][int(b_val) - 1] += 1
                total += 1
                if int(a_val) == int(b_val):
                    agree += 1
        row_sums = conf.sum(axis=1, keepdims=True)
        conf_norm = np.where(row_sums > 0, conf / row_sums * 100, 0)
        im = ax.imshow(conf_norm, cmap="YlGnBu", aspect="equal")
        ax.set_xticks(range(4))
        ax.set_xticklabels(["S1", "S2", "S3", "S4"])
        ax.set_yticks(range(4))
        ax.set_yticklabels(["S1", "S2", "S3", "S4"])
        pct = agree / total * 100 if total > 0 else 0
        ax.set_title(f"{name_a} vs {name_b}\n({pct:.1f}% agree, n={total})", fontweight="bold")
        ax.set_ylabel(name_a)
        ax.set_xlabel(name_b)
        for i in range(4):
            for j in range(4):
                val = conf_norm[i][j]
                if val > 0.5:
                    color = "white" if val > 50 else "black"
                    ax.text(j, i, f"{val:.0f}", ha="center", va="center", fontsize=9, color=color)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "07_cross_source_specificity.png", dpi=150)
    plt.close(fig)
    print(" 07_cross_source_specificity.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
# CHART 08: Adjudication tier breakdown
# ═══════════════════════════════════════════════════════════
def plot_adjudication_tiers():
    """Chart 08: tier sizes (left) and per-tier category mix (right).

    Left panel: one bar per adjudication tier, annotated with its count and
    percent of the holdout.  Right panel: 100%-stacked horizontal bars of
    each tier's plurality-category composition.

    Fixes vs. original: the first stacked-bar construction (which was
    immediately discarded via ``ax2.clear()`` and rebuilt) is removed, and
    the paragraph total is taken from ``len(consensus)`` instead of a
    hard-coded 1200.  Writes ``08_adjudication_tiers.png``.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    n_total = len(consensus) or 1  # guard divide-by-zero on an empty holdout

    # Tier counts
    tier_sizes = [len(tiers[t]) for t in range(1, 5)]
    tier_labels = [
        "Tier 1\n10+/13 agree\n(auto)",
        "Tier 2\nHuman+GenAI\nmaj agree",
        "Tier 3\nHumans split\nGenAI converges",
        "Tier 4\nUniversal\ndisagreement",
    ]
    tier_colors = ["#27ae60", "#3498db", "#f39c12", "#e74c3c"]
    bars = ax1.bar(range(4), tier_sizes, color=tier_colors)
    ax1.set_xticks(range(4))
    ax1.set_xticklabels(tier_labels, fontsize=8)
    ax1.set_ylabel("Paragraphs")
    ax1.set_title(f"Adjudication Tier Distribution ({n_total:,} paragraphs)", fontweight="bold")
    for bar, n in zip(bars, tier_sizes):
        pct = n / n_total * 100
        ax1.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 5,
                 f"{n}\n({pct:.1f}%)", ha="center", fontsize=10, fontweight="bold")

    # Per-tier category mix: one stacked segment per category, where each
    # paragraph counts toward its plurality category across all 13 signals.
    tier_names = [f"Tier {t}" for t in range(1, 5)]
    bottom = np.zeros(4)
    colors = plt.cm.Set2(np.linspace(0, 1, len(CATEGORIES)))
    for ci, cat in enumerate(CATEGORIES):
        vals = []
        for t in range(1, 5):
            cat_count = sum(1 for pid in tiers[t]
                            if consensus[pid]["all_cat_counts"].most_common(1)[0][0] == cat)
            vals.append(cat_count / len(tiers[t]) * 100 if tiers[t] else 0)
        ax2.barh(tier_names, vals, left=bottom, color=colors[ci], label=CAT_MAP[cat])
        bottom += np.array(vals)

    ax2.set_xlabel("% of paragraphs in tier")
    ax2.set_title("Category Mix by Adjudication Tier", fontweight="bold")
    ax2.legend(bbox_to_anchor=(1.02, 1), loc="upper left", fontsize=8)
    ax2.set_xlim(0, 105)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "08_adjudication_tiers.png", dpi=150)
    plt.close(fig)
    print(" 08_adjudication_tiers.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
# CHART 09: Per-model accuracy vs Opus (as quasi-ground-truth)
# ═══════════════════════════════════════════════════════════
def plot_model_accuracy_vs_opus():
    """Chart 09: each model's agreement rate with Opus 4.6.

    Left panel: category agreement; right: specificity agreement.  Opus is
    treated as quasi-ground-truth, so only paragraphs where both Opus and
    the model produced a signal count, and Opus itself is excluded.  Bars
    are colored by cost tier and sorted by category agreement.

    Fixes vs. original: reuses the module-level ``MODEL_ID_TO_SHORT``
    reverse map (which, despite its name, maps short name -> full model id)
    instead of rebuilding it inline, and drops the unused ``width`` local.
    Writes ``09_model_accuracy_vs_opus.png``.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))

    models = sorted(ALL_GENAI)
    cat_acc = []
    spec_acc = []
    model_labels = []

    for model in models:
        agree_cat, agree_spec, total = 0, 0, 0
        for pid, c in consensus.items():
            sig = c["signals"]
            if model in sig and "Opus 4.6" in sig and model != "Opus 4.6":
                total += 1
                if sig[model]["cat"] == sig["Opus 4.6"]["cat"]:
                    agree_cat += 1
                if sig[model]["spec"] == sig["Opus 4.6"]["spec"]:
                    agree_spec += 1
        if total > 0:
            cat_acc.append(agree_cat / total * 100)
            spec_acc.append(agree_spec / total * 100)
            model_labels.append(model)

    # Sort all three parallel lists by category accuracy, descending.
    order = np.argsort(cat_acc)[::-1]
    cat_acc = [cat_acc[i] for i in order]
    spec_acc = [spec_acc[i] for i in order]
    model_labels = [model_labels[i] for i in order]

    # Color by cost tier; MODEL_ID_TO_SHORT maps short name -> full id.
    tier_c = [TIER_COLORS.get(MODEL_TIER.get(
        MODEL_ID_TO_SHORT.get(m, ""), ""), "#999") for m in model_labels]

    x = np.arange(len(model_labels))
    bars1 = ax1.barh(x, cat_acc, color=tier_c, edgecolor="black", linewidth=0.5)
    ax1.set_yticks(x)
    ax1.set_yticklabels(model_labels, fontsize=8)
    ax1.set_xlabel("Agreement with Opus (%)")
    ax1.set_title("Category Agreement with Opus 4.6", fontweight="bold")
    ax1.set_xlim(60, 100)
    ax1.invert_yaxis()  # best model on top
    for bar, v in zip(bars1, cat_acc):
        ax1.text(bar.get_width() + 0.3, bar.get_y() + bar.get_height() / 2,
                 f"{v:.1f}%", va="center", fontsize=8)

    bars2 = ax2.barh(x, spec_acc, color=tier_c, edgecolor="black", linewidth=0.5)
    ax2.set_yticks(x)
    ax2.set_yticklabels(model_labels, fontsize=8)
    ax2.set_xlabel("Agreement with Opus (%)")
    ax2.set_title("Specificity Agreement with Opus 4.6", fontweight="bold")
    ax2.set_xlim(30, 100)
    ax2.invert_yaxis()
    for bar, v in zip(bars2, spec_acc):
        ax2.text(bar.get_width() + 0.3, bar.get_y() + bar.get_height() / 2,
                 f"{v:.1f}%", va="center", fontsize=8)

    # Legend for tiers
    from matplotlib.patches import Patch
    legend_elements = [Patch(facecolor=c, label=t) for t, c in TIER_COLORS.items()]
    ax1.legend(handles=legend_elements, loc="lower right", fontsize=8)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "09_model_accuracy_vs_opus.png", dpi=150)
    plt.close(fig)
    print(" 09_model_accuracy_vs_opus.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
# CHART 10: Cost vs Accuracy scatter
# ═══════════════════════════════════════════════════════════
def plot_cost_vs_accuracy():
    """Chart 10: scatter of average call cost vs category agreement with Opus.

    Cost/latency are averaged from each annotation's provenance across the
    bench, Stage-1, and Opus record sets; accuracy is category agreement
    with Opus (Opus excluded from its own comparison).  Points are colored
    by cost tier.

    Fix vs. original: the ``MODEL_SHORT`` reverse map was rebuilt inside the
    per-model loop; the loop-invariant module-level ``MODEL_ID_TO_SHORT``
    (short name -> full id, despite its name) is used instead.  Writes
    ``10_cost_vs_accuracy.png``.
    """
    fig, ax = plt.subplots(figsize=(12, 7))

    # Gather cost and accuracy data per model
    model_costs: dict[str, list[float]] = defaultdict(list)
    model_lats: dict[str, list[float]] = defaultdict(list)

    # From bench (same partial-run filter as the loader above).
    for bf in bench_files:
        if "errors" in bf.name:
            continue
        records = load_jsonl(bf)
        if len(records) < 100:
            continue
        mid = records[0]["provenance"]["modelId"]
        short = MODEL_SHORT.get(mid, mid.split("/")[-1])
        for r in records:
            model_costs[short].append(r["provenance"].get("costUsd", 0))
            model_lats[short].append(r["provenance"].get("latencyMs", 0))

    # Stage 1 costs from annotations
    for pid, annots in stage1_by_pid.items():
        for a in annots:
            mid = a["provenance"]["modelId"]
            short = MODEL_SHORT.get(mid, mid.split("/")[-1])
            model_costs[short].append(a["provenance"].get("costUsd", 0))
            model_lats[short].append(a["provenance"].get("latencyMs", 0))

    # Opus
    for r in opus_by_pid.values():
        model_costs["Opus 4.6"].append(r["provenance"].get("costUsd", 0))
        model_lats["Opus 4.6"].append(r["provenance"].get("latencyMs", 0))

    for model in sorted(ALL_GENAI):
        costs = model_costs.get(model, [])
        if not costs:
            continue
        avg_cost = sum(costs) / len(costs)
        avg_lat = sum(model_lats.get(model, [])) / max(len(model_lats.get(model, [])), 1) / 1000  # seconds

        # Category accuracy vs Opus
        agree, total = 0, 0
        for pid, c in consensus.items():
            sig = c["signals"]
            if model in sig and "Opus 4.6" in sig and model != "Opus 4.6":
                total += 1
                if sig[model]["cat"] == sig["Opus 4.6"]["cat"]:
                    agree += 1
        cat_acc = agree / total * 100 if total > 0 else 0

        mid_full = MODEL_ID_TO_SHORT.get(model, "")  # short name -> full id
        tier = MODEL_TIER.get(mid_full, "mid")
        color = TIER_COLORS.get(tier, "#999")

        # x-axis in millicents so the cheap models don't collapse at 0.
        ax.scatter(avg_cost * 1000, cat_acc, s=150, c=color, edgecolors="black",
                   linewidths=0.5, zorder=3)
        ax.annotate(model, (avg_cost * 1000, cat_acc),
                    textcoords="offset points", xytext=(8, 4), fontsize=7)

    ax.set_xlabel("Average Cost per Call (millicents, $0.001)")
    ax.set_ylabel("Category Agreement with Opus (%)")
    ax.set_title("Cost vs Category Accuracy (Opus as reference)", fontweight="bold")
    ax.set_ylim(60, 100)

    from matplotlib.patches import Patch
    legend_elements = [Patch(facecolor=c, label=t) for t, c in TIER_COLORS.items()]
    ax.legend(handles=legend_elements, loc="lower right")

    fig.tight_layout()
    fig.savefig(CHART_DIR / "10_cost_vs_accuracy.png", dpi=150)
    plt.close(fig)
    print(" 10_cost_vs_accuracy.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 11: Per-category accuracy by model
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_per_category_accuracy():
    """Chart 11: heatmap of per-category recall for each GenAI model,
    treating the Opus 4.6 label as ground truth.

    Cell (model, cat) = % of paragraphs Opus labeled `cat` on which the
    model also said `cat`. Saves 11_per_category_accuracy.png.
    """
    fig, ax = plt.subplots(figsize=(16, 8))

    models = sorted(ALL_GENAI)
    # For each model, compute recall vs Opus per category.
    data = np.zeros((len(models), len(CATEGORIES)))
    for mi, model in enumerate(models):
        for ci, cat in enumerate(CATEGORIES):
            agree, total = 0, 0
            for pid, c in consensus.items():
                sig = c["signals"]
                if "Opus 4.6" in sig and model in sig and model != "Opus 4.6":
                    if sig["Opus 4.6"]["cat"] == cat:
                        total += 1
                        if sig[model]["cat"] == cat:
                            agree += 1
            data[mi][ci] = agree / total * 100 if total > 0 else 0

    im = ax.imshow(data, cmap="RdYlGn", aspect="auto", vmin=50, vmax=100)
    ax.set_xticks(range(len(CAT_SHORT)))
    ax.set_xticklabels(CAT_SHORT, fontsize=10)
    ax.set_yticks(range(len(models)))
    ax.set_yticklabels(models, fontsize=8)
    ax.set_title("Per-Category Recall vs Opus (%) — Where each model excels/struggles", fontweight="bold")
    ax.set_xlabel("Opus label (true category)")

    # Overlay numbers; white text on the darker (low-value) cells for contrast.
    for i in range(len(models)):
        for j in range(len(CATEGORIES)):
            val = data[i][j]
            color = "white" if val < 65 else "black"
            ax.text(j, i, f"{val:.0f}", ha="center", va="center", fontsize=8, color=color)

    fig.colorbar(im, ax=ax, shrink=0.6, label="Recall %")
    fig.tight_layout()
    fig.savefig(CHART_DIR / "11_per_category_accuracy.png", dpi=150)
    plt.close(fig)
    print(" 11_per_category_accuracy.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 12: Ensemble size vs accuracy (how many models needed?)
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_ensemble_accuracy():
    """Chart 12: how ensemble size affects agreement with Opus.

    For each ensemble size k, draw random k-subsets of the non-Opus GenAI
    models, take the majority vote per paragraph, and score category and
    specificity agreement against Opus. Box plots summarize the spread
    across sampled subsets. Saves 12_ensemble_accuracy.png.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # For each ensemble size k, sample n_trials random subsets of k GenAI
    # models, take majority vote, compare to Opus.
    all_models = sorted(ALL_GENAI)
    # Opus is the reference, so it never sits inside an ensemble.
    ensemble_candidates = [m for m in all_models if m != "Opus 4.6"]

    rng = np.random.RandomState(42)  # fixed seed for reproducible sampling
    max_k = len(ensemble_candidates)
    n_trials = 200

    cat_accs_by_k = []
    spec_accs_by_k = []

    for k in range(1, max_k + 1):
        cat_accs = []
        spec_accs = []
        subsets = []
        if k >= max_k:
            # Only one possible subset at full size — no sampling needed.
            subsets = [ensemble_candidates]
        else:
            for _ in range(n_trials):
                subsets.append(list(rng.choice(ensemble_candidates, k, replace=False)))

        for subset in subsets:
            agree_cat, agree_spec, total = 0, 0, 0
            for pid, c in consensus.items():
                sig = c["signals"]
                if "Opus 4.6" not in sig:
                    continue
                sub_cats = [sig[m]["cat"] for m in subset if m in sig]
                sub_specs = [sig[m]["spec"] for m in subset if m in sig]
                # Skip paragraphs where some subset member has no signal.
                if len(sub_cats) < k:
                    continue
                total += 1
                ens_cat = majority_vote(sub_cats)
                # Specificity is voted on as strings, then compared as int.
                ens_spec = majority_vote([str(s) for s in sub_specs])
                if ens_cat == sig["Opus 4.6"]["cat"]:
                    agree_cat += 1
                if ens_spec is not None and int(ens_spec) == sig["Opus 4.6"]["spec"]:
                    agree_spec += 1
            if total > 0:
                cat_accs.append(agree_cat / total * 100)
                spec_accs.append(agree_spec / total * 100)

        cat_accs_by_k.append(cat_accs)
        spec_accs_by_k.append(spec_accs)

    # Box plots: one box per ensemble size.
    ks = range(1, max_k + 1)
    ax1.boxplot(cat_accs_by_k, positions=list(ks), widths=0.6, patch_artist=True,
                boxprops=dict(facecolor="#3498db", alpha=0.5),
                medianprops=dict(color="red", linewidth=2))
    ax1.set_xlabel("Ensemble size (# GenAI models)")
    ax1.set_ylabel("Category agreement with Opus (%)")
    ax1.set_title("Ensemble Size vs Category Accuracy", fontweight="bold")
    ax1.set_xticks(list(ks))
    ax1.set_xticklabels(list(ks))

    ax2.boxplot(spec_accs_by_k, positions=list(ks), widths=0.6, patch_artist=True,
                boxprops=dict(facecolor="#e74c3c", alpha=0.5),
                medianprops=dict(color="red", linewidth=2))
    ax2.set_xlabel("Ensemble size (# GenAI models)")
    ax2.set_ylabel("Specificity agreement with Opus (%)")
    ax2.set_title("Ensemble Size vs Specificity Accuracy", fontweight="bold")
    ax2.set_xticks(list(ks))
    ax2.set_xticklabels(list(ks))

    fig.tight_layout()
    fig.savefig(CHART_DIR / "12_ensemble_accuracy.png", dpi=150)
    plt.close(fig)
    print(" 12_ensemble_accuracy.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 13: Agreement by word count (human + genai)
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_agreement_by_wordcount():
    """Chart 13: unanimity rate (human 3/3 vs GenAI 10/10) binned by
    paragraph word count, for category alone and category+specificity.

    Saves 13_agreement_by_wordcount.png.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
    # Inclusive word-count bins; labels below correspond one-to-one.
    wc_bins = [(0, 50), (51, 80), (81, 120), (121, 180), (181, 500)]
    bin_labels = ["≤50", "51-80", "81-120", "121-180", "180+"]

    for ax, dim, title in [(ax1, "cat", "Category"), (ax2, "both", "Both")]:
        h_rates, g_rates, ns = [], [], []
        for lo, hi in wc_bins:
            h_agree, g_agree, total = 0, 0, 0
            for pid, c in consensus.items():
                wc = c["word_count"]
                if lo <= wc <= hi:
                    total += 1
                    if dim == "cat":
                        if c["human_cat_unanimous"]:
                            h_agree += 1
                        if len(set(c["genai_cats"])) == 1:
                            g_agree += 1
                    else:
                        # "Both" requires unanimity on category AND specificity.
                        if c["human_cat_unanimous"] and c["human_spec_unanimous"]:
                            h_agree += 1
                        if len(set(c["genai_cats"])) == 1 and len(set(c["genai_specs"])) == 1:
                            g_agree += 1
            h_rates.append(h_agree / total * 100 if total > 0 else 0)
            g_rates.append(g_agree / total * 100 if total > 0 else 0)
            ns.append(total)

        x = np.arange(len(bin_labels))
        width = 0.35
        ax.bar(x - width / 2, h_rates, width, label="Human unanimous", color="#3498db")
        ax.bar(x + width / 2, g_rates, width, label="GenAI unanimous", color="#e74c3c")
        ax.set_xticks(x)
        ax.set_xticklabels(bin_labels)
        ax.set_xlabel("Word Count")
        ax.set_ylabel("Unanimous %")
        ax.set_title(f"{title} Unanimity by Paragraph Length", fontweight="bold")
        ax.legend()
        # Annotate each bin with its sample size above the taller bar.
        for i, n in enumerate(ns):
            ax.text(i, max(h_rates[i], g_rates[i]) + 1, f"n={n}", ha="center", fontsize=8)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "13_agreement_by_wordcount.png", dpi=150)
    plt.close(fig)
    print(" 13_agreement_by_wordcount.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 14: Time vs agreement
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_time_vs_agreement():
    """Chart 14: distribution of median per-paragraph labeling time,
    split by whether the human annotators agreed on the category.

    Saves 14_time_vs_agreement.png.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    agreed_times, disagreed_times = [], []
    for pid, lbls in human_by_pid.items():
        # Prefer activeMs; fall back to durationMs when activeMs is absent
        # (NOTE: `or` also falls through when activeMs == 0).
        times = [l.get("activeMs") or l.get("durationMs") for l in lbls]
        times = [t for t in times if t is not None]
        if not times:
            continue
        # Median (upper middle element), converted ms -> seconds.
        med_time = sorted(times)[len(times) // 2] / 1000
        cats = [l["contentCategory"] for l in lbls]
        if len(set(cats)) == 1:
            agreed_times.append(med_time)
        else:
            disagreed_times.append(med_time)

    bins = np.linspace(0, 120, 30)
    ax.hist(agreed_times, bins=bins, alpha=0.6, label=f"Category agreed (n={len(agreed_times)})",
            color="#2ecc71", density=True)
    ax.hist(disagreed_times, bins=bins, alpha=0.6, label=f"Category disagreed (n={len(disagreed_times)})",
            color="#e74c3c", density=True)
    ax.set_xlabel("Median Active Time per Paragraph (seconds)")
    ax.set_ylabel("Density")
    ax.set_title("Labeling Time: Agreed vs Disagreed Paragraphs", fontweight="bold")
    ax.legend()
    ax.set_xlim(0, 120)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "14_time_vs_agreement.png", dpi=150)
    plt.close(fig)
    print(" 14_time_vs_agreement.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 15: Outlier annotator deep-dive
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_outlier_annotator():
    """Chart 15: deep-dive on the lowest-kappa annotator.

    Identifies the annotator with the lowest mean pairwise category kappa,
    then, on paragraphs where the other two annotators agreed against them,
    plots (left) what the others chose and (right) what the outlier chose.
    Saves 15_outlier_annotator.png.
    """
    cat_kappas = metrics["pairwiseKappa"]["category"]["pairs"]
    # Accumulate each annotator's kappa across every pair they appear in.
    ann_kappa_sum = defaultdict(lambda: {"sum": 0, "n": 0})
    for pair in cat_kappas:
        for a in ("a1", "a2"):
            ann_kappa_sum[pair[a]]["sum"] += pair["kappa"]
            ann_kappa_sum[pair[a]]["n"] += 1
    # Outlier = annotator with the lowest average kappa.
    outlier = min(ann_kappa_sum, key=lambda a: ann_kappa_sum[a]["sum"] / ann_kappa_sum[a]["n"])

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    outlier_diverge_from = Counter()  # category the two others agreed on
    outlier_diverge_to = Counter()    # category the outlier picked instead
    for pid, lbls in human_by_pid.items():
        outlier_lbl = None
        others = []
        for l in lbls:
            if l["annotatorName"] == outlier:
                outlier_lbl = l
            else:
                others.append(l)
        if outlier_lbl and len(others) >= 2:
            other_cats = [o["contentCategory"] for o in others]
            # Count only clean 2-vs-1 splits against the outlier.
            if other_cats[0] == other_cats[1] and other_cats[0] != outlier_lbl["contentCategory"]:
                outlier_diverge_from[other_cats[0]] += 1
                outlier_diverge_to[outlier_lbl["contentCategory"]] += 1

    cats1 = sorted(outlier_diverge_from.keys(), key=lambda c: -outlier_diverge_from[c])
    ax1.barh(range(len(cats1)), [outlier_diverge_from[c] for c in cats1], color="#e74c3c")
    ax1.set_yticks(range(len(cats1)))
    ax1.set_yticklabels([CAT_MAP.get(c, c) for c in cats1])
    ax1.set_xlabel("Count")
    ax1.set_title(f"{outlier}: what others agreed on", fontweight="bold")
    ax1.invert_yaxis()

    cats2 = sorted(outlier_diverge_to.keys(), key=lambda c: -outlier_diverge_to[c])
    ax2.barh(range(len(cats2)), [outlier_diverge_to[c] for c in cats2], color="#f39c12")
    ax2.set_yticks(range(len(cats2)))
    ax2.set_yticklabels([CAT_MAP.get(c, c) for c in cats2])
    ax2.set_xlabel("Count")
    ax2.set_title(f"What {outlier} chose instead", fontweight="bold")
    ax2.invert_yaxis()

    avg_k = ann_kappa_sum[outlier]["sum"] / ann_kappa_sum[outlier]["n"]
    fig.suptitle(f"Outlier Analysis: {outlier} (avg κ = {avg_k:.3f})", fontweight="bold")
    fig.tight_layout()
    fig.savefig(CHART_DIR / "15_outlier_annotator.png", dpi=150)
    plt.close(fig)
    print(" 15_outlier_annotator.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 16: With/without outlier consensus
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_with_without_outlier():
    """Chart 16: how consensus metrics change when the outlier annotator
    is excluded.

    Left: unanimity rates with all 3 annotators vs the remaining 2, on
    paragraphs the outlier labeled. Right: pairwise-kappa distribution
    with and without the outlier's pairs. Saves 16_with_without_outlier.png.
    """
    cat_kappas = metrics["pairwiseKappa"]["category"]["pairs"]
    # Same outlier identification as plot_outlier_annotator: lowest mean kappa.
    ann_kappa_sum = defaultdict(lambda: {"sum": 0, "n": 0})
    for pair in cat_kappas:
        for a in ("a1", "a2"):
            ann_kappa_sum[pair[a]]["sum"] += pair["kappa"]
            ann_kappa_sum[pair[a]]["n"] += 1
    outlier = min(ann_kappa_sum, key=lambda a: ann_kappa_sum[a]["sum"] / ann_kappa_sum[a]["n"])

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    n = 0
    cat_w, cat_wo, spec_w, spec_wo, both_w, both_wo = 0, 0, 0, 0, 0, 0
    for pid, lbls in human_by_pid.items():
        names = [l["annotatorName"] for l in lbls]
        # Only paragraphs the outlier labeled and with a full triple.
        if outlier not in names or len(lbls) < 3:
            continue
        n += 1
        cats_all = [l["contentCategory"] for l in lbls]
        specs_all = [l["specificityLevel"] for l in lbls]
        cats_excl = [l["contentCategory"] for l in lbls if l["annotatorName"] != outlier]
        specs_excl = [l["specificityLevel"] for l in lbls if l["annotatorName"] != outlier]

        cat_u = len(set(cats_all)) == 1
        cat_e = len(set(cats_excl)) == 1
        spec_u = len(set(specs_all)) == 1
        spec_e = len(set(specs_excl)) == 1

        if cat_u: cat_w += 1
        if cat_e: cat_wo += 1
        if spec_u: spec_w += 1
        if spec_e: spec_wo += 1
        if cat_u and spec_u: both_w += 1
        if cat_e and spec_e: both_wo += 1

    labels_m = ["Category\nUnanimous", "Specificity\nUnanimous", "Both\nUnanimous"]
    # NOTE: assumes n > 0, i.e. the outlier labeled at least one full triple.
    with_v = [cat_w / n * 100, spec_w / n * 100, both_w / n * 100]
    without_v = [cat_wo / n * 100, spec_wo / n * 100, both_wo / n * 100]

    x = np.arange(3)
    width = 0.35
    ax1.bar(x - width / 2, with_v, width, label="All 3", color="#e74c3c")
    ax1.bar(x + width / 2, without_v, width, label=f"Excl. {outlier}", color="#2ecc71")
    ax1.set_xticks(x)
    ax1.set_xticklabels(labels_m)
    ax1.set_ylabel("% of paragraphs")
    ax1.set_title(f"Agreement on {outlier}'s paragraphs (n={n})", fontweight="bold")
    ax1.legend()
    # Annotate the percentage-point delta from excluding the outlier.
    for i, (w, wo) in enumerate(zip(with_v, without_v)):
        ax1.text(i, max(w, wo) + 2, f"Δ={wo - w:+.1f}pp", ha="center", fontsize=9, fontweight="bold")

    kappas_with = [p["kappa"] for p in cat_kappas]
    kappas_without = [p["kappa"] for p in cat_kappas if outlier not in (p["a1"], p["a2"])]
    bp = ax2.boxplot([kappas_with, kappas_without], positions=[1, 2], widths=0.5, patch_artist=True)
    bp["boxes"][0].set_facecolor("#e74c3c")
    bp["boxes"][0].set_alpha(0.5)
    bp["boxes"][1].set_facecolor("#2ecc71")
    bp["boxes"][1].set_alpha(0.5)
    ax2.set_xticks([1, 2])
    ax2.set_xticklabels(["All pairs", f"Excl. {outlier}"])
    ax2.set_ylabel("Cohen's κ (category)")
    ax2.set_title("Kappa Distribution", fontweight="bold")
    # Jittered scatter overlays the raw kappa values on each box.
    rng = np.random.RandomState(42)
    for pos, kappas in zip([1, 2], [kappas_with, kappas_without]):
        jitter = rng.normal(0, 0.04, len(kappas))
        ax2.scatter([pos + j for j in jitter], kappas, alpha=0.6, s=30, color="black", zorder=3)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "16_with_without_outlier.png", dpi=150)
    plt.close(fig)
    print(" 16_with_without_outlier.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 17: Disagreement axes — Human vs Stage1 vs All GenAI
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_disagreement_axes():
    """Chart 17: top category-confusion pairs for humans, the Stage 1
    panel, and all 10 GenAI models, side by side.

    Saves 17_disagreement_axes.png.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    def compute_axes(cat_lists: list[list[str]]) -> Counter:
        """Count unordered label pairs that disagree within each paragraph."""
        result = Counter()
        for cats in cat_lists:
            if len(set(cats)) >= 2:
                for i, c1 in enumerate(cats):
                    for c2 in cats[i + 1:]:
                        if c1 != c2:
                            # Sort so (A,B) and (B,A) collapse into one axis.
                            result[tuple(sorted([c1, c2]))] += 1
        return result

    human_axes = compute_axes([c["human_cats"] for c in consensus.values()])
    s1_axes = compute_axes([c["s1_cats"] for c in consensus.values()])
    genai_axes = compute_axes([c["genai_cats"] for c in consensus.values()])

    for ax, data, title, color in [
        (axes[0], human_axes, "Human", "#e74c3c"),
        (axes[1], s1_axes, "Stage 1", "#3498db"),
        (axes[2], genai_axes, "All GenAI (10)", "#2ecc71"),
    ]:
        top = data.most_common(10)
        labels = [f"{CAT_MAP[a]}↔{CAT_MAP[b]}" for (a, b), _ in top]
        counts = [c for _, c in top]
        ax.barh(range(len(labels)), counts, color=color)
        ax.set_yticks(range(len(labels)))
        ax.set_yticklabels(labels, fontsize=9)
        ax.set_xlabel("Disagreement count")
        ax.set_title(f"{title} Confusion Axes", fontweight="bold")
        ax.invert_yaxis()

    fig.tight_layout()
    fig.savefig(CHART_DIR / "17_disagreement_axes.png", dpi=150)
    plt.close(fig)
    print(" 17_disagreement_axes.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 18: None/Other analysis
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_none_other_analysis():
    """Chart 18: None/Other disagreement analysis.

    Left: on paragraphs where at least one human said None/Other but not
    all did, what the other annotators chose. Right: the GenAI majority
    label for all human-disagreed paragraphs. Saves 18_none_other_analysis.png.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    noneother_vs = Counter()
    noneother_pids = set()
    for pid, lbls in human_by_pid.items():
        cats = [l["contentCategory"] for l in lbls]
        # Mixed paragraphs containing at least one None/Other vote.
        if "None/Other" in cats and len(set(cats)) > 1:
            noneother_pids.add(pid)
            for c in cats:
                if c != "None/Other":
                    noneother_vs[c] += 1

    cats_sorted = sorted(noneother_vs.keys(), key=lambda c: -noneother_vs[c])
    ax1.barh(range(len(cats_sorted)), [noneother_vs[c] for c in cats_sorted], color="#e74c3c")
    ax1.set_yticks(range(len(cats_sorted)))
    ax1.set_yticklabels([CAT_MAP.get(c, c) for c in cats_sorted])
    ax1.set_xlabel("Count")
    ax1.set_title(f"When someone says N/O, others say...\n({len(noneother_pids)} paragraphs)",
                  fontweight="bold")
    ax1.invert_yaxis()

    # What do GenAI models say for human-disagreed paragraphs?
    genai_for_disagreed = Counter()
    for pid, c in consensus.items():
        if not c["human_cat_unanimous"] and c["genai_cat_maj"]:
            genai_for_disagreed[c["genai_cat_maj"]] += 1

    cats_sorted2 = sorted(genai_for_disagreed.keys(), key=lambda c: -genai_for_disagreed[c])
    ax2.barh(range(len(cats_sorted2)), [genai_for_disagreed[c] for c in cats_sorted2], color="#3498db")
    ax2.set_yticks(range(len(cats_sorted2)))
    ax2.set_yticklabels([CAT_MAP.get(c, c) for c in cats_sorted2])
    ax2.set_xlabel("Count")
    ax2.set_title(f"GenAI majority for human-disagreed\n(n={sum(genai_for_disagreed.values())})",
                  fontweight="bold")
    ax2.invert_yaxis()

    fig.tight_layout()
    fig.savefig(CHART_DIR / "18_none_other_analysis.png", dpi=150)
    plt.close(fig)
    print(" 18_none_other_analysis.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 19: Specificity bias per model vs Opus
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_specificity_bias_all():
    """Chart 19: mean specificity bias (source − Opus) for every human
    annotator and every GenAI model.

    Positive bars mean the source rates specificity higher than Opus.
    Saves 19_specificity_bias_all.png.
    """
    fig, ax = plt.subplots(figsize=(16, 6))

    # Humans first, then models, so a divider can separate the groups.
    sources = annotator_names + sorted(ALL_GENAI)
    biases = []
    for src in sources:
        diffs = []
        for pid, c in consensus.items():
            if "Opus 4.6" not in c["signals"]:
                continue
            opus_spec = c["signals"]["Opus 4.6"]["spec"]
            if src in annotator_names:
                # Human: look up their label for this paragraph.
                for l in human_by_pid.get(pid, []):
                    if l["annotatorName"] == src:
                        diffs.append(l["specificityLevel"] - opus_spec)
            elif src in c["signals"] and src != "Opus 4.6":
                diffs.append(c["signals"][src]["spec"] - opus_spec)
        biases.append(np.mean(diffs) if diffs else 0)

    colors = []
    for i, (src, b) in enumerate(zip(sources, biases)):
        if src in annotator_names:
            # Purple family for humans; darker when strongly biased.
            colors.append("#9b59b6" if abs(b) > 0.5 else "#8e44ad")
        else:
            # Models are colored by pricing tier.
            mid = {v: k for k, v in MODEL_SHORT.items()}.get(src, "")
            colors.append(TIER_COLORS.get(MODEL_TIER.get(mid, "mid"), "#999"))

    bars = ax.bar(range(len(sources)), biases, color=colors, edgecolor="black", linewidth=0.3)
    ax.set_xticks(range(len(sources)))
    ax.set_xticklabels(sources, rotation=60, ha="right", fontsize=7)
    ax.set_ylabel("Mean (Source − Opus) Specificity")
    ax.set_title("Specificity Bias vs Opus 4.6 (positive = over-rates specificity)", fontweight="bold")
    ax.axhline(0, color="black", linewidth=1)

    # Add a vertical line separating humans from models
    ax.axvline(len(annotator_names) - 0.5, color="gray", linewidth=1, linestyle="--", alpha=0.5)
    ax.text(len(annotator_names) / 2, ax.get_ylim()[1] * 0.9, "Humans", ha="center", fontsize=9, style="italic")
    ax.text(len(annotator_names) + len(ALL_GENAI) / 2, ax.get_ylim()[1] * 0.9, "GenAI", ha="center", fontsize=9, style="italic")

    # Label bars with their value, skipping near-zero bars to reduce clutter.
    for bar, b in zip(bars, biases):
        if abs(b) > 0.05:
            ax.text(bar.get_x() + bar.get_width() / 2,
                    bar.get_height() + (0.02 if b >= 0 else -0.06),
                    f"{b:+.2f}", ha="center", fontsize=7)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "19_specificity_bias_all.png", dpi=150)
    plt.close(fig)
    print(" 19_specificity_bias_all.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 20: Quiz vs quality
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_quiz_vs_quality():
    """Chart 20: per-annotator quiz attempts vs agreement with Opus.

    Dual-axis bar chart: left axis = number of qualification-quiz
    attempts, right axis = category agreement with Opus (%).
    Saves 20_quiz_vs_quality.png.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    quiz_sessions = load_jsonl(GOLD_DIR / "quiz-sessions.jsonl")

    # Count quiz attempts per annotator.
    attempts: dict[str, int] = defaultdict(int)
    for q in quiz_sessions:
        attempts[q["annotatorName"]] += 1

    # Index each annotator's labels by paragraph id for fast lookup.
    ann_labels_by_name: dict[str, dict[str, dict]] = defaultdict(dict)
    for l in human_labels:
        ann_labels_by_name[l["annotatorName"]][l["paragraphId"]] = l

    opus_agree = {}
    for name in annotator_names:
        agree, total = 0, 0
        for pid, lbl in ann_labels_by_name[name].items():
            c = consensus.get(pid)
            if c and c["opus_cat"]:
                total += 1
                if lbl["contentCategory"] == c["opus_cat"]:
                    agree += 1
        opus_agree[name] = agree / total * 100 if total > 0 else 0

    x = np.arange(len(annotator_names))
    width = 0.35
    ax.bar(x - width / 2, [attempts.get(n, 0) for n in annotator_names],
           width, label="Quiz attempts", color="#f39c12")
    # Second y-axis for the percentage scale.
    ax2 = ax.twinx()
    ax2.bar(x + width / 2, [opus_agree.get(n, 0) for n in annotator_names],
            width, label="Cat agree w/ Opus (%)", color="#3498db", alpha=0.7)

    ax.set_xticks(x)
    ax.set_xticklabels(annotator_names, rotation=45, ha="right")
    ax.set_ylabel("Quiz attempts", color="#f39c12")
    ax2.set_ylabel("Opus agreement %", color="#3498db")
    ax.set_title("Quiz Attempts vs Labeling Quality (Opus Agreement)", fontweight="bold")

    # Merge legends from both axes into a single box.
    lines1, labels1 = ax.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax.legend(lines1 + lines2, labels1 + labels2, loc="upper left")

    fig.tight_layout()
    fig.savefig(CHART_DIR / "20_quiz_vs_quality.png", dpi=150)
    plt.close(fig)
    print(" 20_quiz_vs_quality.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 21: Human vs GenAI consensus rates
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_human_vs_genai_consensus():
    """Chart 21: human vs GenAI consensus rates.

    Panels: (1) unanimity rates (human 3/3, GenAI 10/10, both),
    (2) human-majority vs GenAI-majority category agreement,
    (3) same for specificity. Saves 21_human_vs_genai_consensus.png.
    """
    fig, axes = plt.subplots(1, 3, figsize=(16, 5))

    # Category unanimity
    h_unan = sum(1 for c in consensus.values() if c["human_cat_unanimous"])
    g_unan = sum(1 for c in consensus.values() if len(set(c["genai_cats"])) == 1)
    b_unan = sum(1 for c in consensus.values() if c["human_cat_unanimous"] and len(set(c["genai_cats"])) == 1)

    ax = axes[0]
    vals = [h_unan, g_unan, b_unan]
    # FIX: denominator was hard-coded to 1200; use the actual paragraph count.
    pcts = [v / len(consensus) * 100 for v in vals]
    labels = ["Human\n3/3", "GenAI\n10/10", "Both"]
    bars = ax.bar(range(3), pcts, color=["#3498db", "#e74c3c", "#2ecc71"])
    ax.set_xticks(range(3))
    ax.set_xticklabels(labels)
    ax.set_ylabel("%")
    ax.set_title("Category Unanimity", fontweight="bold")
    for bar, v, p in zip(bars, vals, pcts):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1,
                f"{p:.1f}%\n({v})", ha="center", fontsize=9)

    # Majority agreement (category)
    ax = axes[1]
    cat_agree = sum(1 for c in consensus.values()
                    if c["human_cat_maj"] and c["genai_cat_maj"] and c["human_cat_maj"] == c["genai_cat_maj"])
    cat_total = sum(1 for c in consensus.values() if c["human_cat_maj"] and c["genai_cat_maj"])
    cat_diff = cat_total - cat_agree

    bars = ax.bar(range(2), [cat_agree / cat_total * 100, cat_diff / cat_total * 100],
                  color=["#2ecc71", "#e74c3c"])
    ax.set_xticks(range(2))
    ax.set_xticklabels(["Agree", "Differ"])
    ax.set_ylabel("%")
    ax.set_title(f"Human Maj vs GenAI Maj — Category\n(n={cat_total})", fontweight="bold")
    for bar, v in zip(bars, [cat_agree, cat_diff]):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5,
                f"{v}", ha="center", fontsize=10)

    # Majority agreement (specificity) — None-checks because a majority may not exist.
    ax = axes[2]
    spec_agree = sum(1 for c in consensus.values()
                     if c["human_spec_maj"] is not None and c["genai_spec_maj"] is not None
                     and c["human_spec_maj"] == c["genai_spec_maj"])
    spec_total = sum(1 for c in consensus.values()
                     if c["human_spec_maj"] is not None and c["genai_spec_maj"] is not None)
    spec_diff = spec_total - spec_agree
    bars = ax.bar(range(2), [spec_agree / spec_total * 100, spec_diff / spec_total * 100],
                  color=["#2ecc71", "#e74c3c"])
    ax.set_xticks(range(2))
    ax.set_xticklabels(["Agree", "Differ"])
    ax.set_ylabel("%")
    ax.set_title(f"Human Maj vs GenAI Maj — Specificity\n(n={spec_total})", fontweight="bold")
    for bar, v in zip(bars, [spec_agree, spec_diff]):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5,
                f"{v}", ha="center", fontsize=10)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "21_human_vs_genai_consensus.png", dpi=150)
    plt.close(fig)
    print(" 21_human_vs_genai_consensus.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 22: Signal agreement distribution (how many of 13 agree?)
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_signal_agreement_dist():
    """Chart 22: distribution of the maximum agreement count per paragraph
    across all 13 signals, for category and specificity.

    The red dashed line marks the Tier 1 threshold (10+ agreeing signals).
    Saves 22_signal_agreement_dist.png.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    cat_top_counts = []
    spec_top_counts = []
    for c in consensus.values():
        # Size of the largest agreeing group for each dimension.
        cat_top_counts.append(c["all_cat_counts"].most_common(1)[0][1])
        spec_top_counts.append(Counter(c["all_specs"]).most_common(1)[0][1])

    ax1.hist(cat_top_counts, bins=range(1, 15), color="#3498db", edgecolor="black", alpha=0.7, align="left")
    ax1.set_xlabel("# signals agreeing on top category")
    ax1.set_ylabel("Paragraphs")
    ax1.set_title("Category: Max Agreement Count per Paragraph", fontweight="bold")
    ax1.axvline(10, color="red", linewidth=2, linestyle="--", label="Tier 1 threshold (10+)")
    ax1.legend()

    ax2.hist(spec_top_counts, bins=range(1, 15), color="#e74c3c", edgecolor="black", alpha=0.7, align="left")
    ax2.set_xlabel("# signals agreeing on top specificity")
    ax2.set_ylabel("Paragraphs")
    ax2.set_title("Specificity: Max Agreement Count per Paragraph", fontweight="bold")
    ax2.axvline(10, color="red", linewidth=2, linestyle="--", label="Tier 1 threshold (10+)")
    ax2.legend()

    fig.tight_layout()
    fig.savefig(CHART_DIR / "22_signal_agreement_dist.png", dpi=150)
    plt.close(fig)
    print(" 22_signal_agreement_dist.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 23: Per-annotator agreement with all references
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_annotator_vs_references():
    """Chart 23: per-annotator agreement against three references
    (Stage 1 majority, Opus, GenAI majority), for category and specificity.

    Saves 23_annotator_vs_references.png.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Index each annotator's labels by paragraph id.
    ann_labels_by_name: dict[str, dict[str, dict]] = defaultdict(dict)
    for l in human_labels:
        ann_labels_by_name[l["annotatorName"]][l["paragraphId"]] = l

    # (display name, consensus key for category, consensus key for specificity)
    refs = [
        ("S1 Maj", "s1_cat_maj", "s1_spec_maj"),
        ("Opus", "opus_cat", "opus_spec"),
        ("GenAI Maj", "genai_cat_maj", "genai_spec_maj"),
    ]

    for ax, dim, title in [(ax1, "cat", "Category"), (ax2, "spec", "Specificity")]:
        x = np.arange(len(annotator_names))
        width = 0.25

        for ri, (ref_name, ref_cat, ref_spec) in enumerate(refs):
            rates = []
            for ann_name in annotator_names:
                agree, total = 0, 0
                for pid, lbl in ann_labels_by_name[ann_name].items():
                    c = consensus.get(pid)
                    if not c:
                        continue
                    ref_val = c[ref_cat] if dim == "cat" else c[ref_spec]
                    ann_val = lbl["contentCategory"] if dim == "cat" else lbl["specificityLevel"]
                    if ref_val is not None:
                        total += 1
                        # String-compare so int specificity matches str references.
                        if str(ann_val) == str(ref_val):
                            agree += 1
                rates.append(agree / total * 100 if total > 0 else 0)

            ax.bar(x + (ri - 1) * width, rates, width, label=ref_name)

        ax.set_xticks(x)
        ax.set_xticklabels(annotator_names, rotation=45, ha="right")
        ax.set_ylabel("Agreement %")
        ax.set_title(f"Per-Annotator {title} Agreement", fontweight="bold")
        ax.legend()
        ax.set_ylim(0, 100)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "23_annotator_vs_references.png", dpi=150)
    plt.close(fig)
    print(" 23_annotator_vs_references.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 24: "Hard paragraph" analysis — what makes Tier 4 different?
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_hard_paragraphs():
    """Chart 24: what makes the hardest (Tier 4) paragraphs different.

    2x2 grid: word-count distribution by tier, category profile by tier,
    specificity profile by tier, and the top confusion axes within Tier 4.
    Saves 24_hard_paragraphs.png.
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Word count distribution by tier
    ax = axes[0][0]
    tier_wcs = {t: [consensus[pid]["word_count"] for pid in pids] for t, pids in tiers.items()}
    data = [tier_wcs[t] for t in range(1, 5)]
    bp = ax.boxplot(data, positions=range(1, 5), widths=0.6, patch_artist=True)
    # One color per tier (green=easy .. red=hard); reused by the later panels.
    colors_t = ["#27ae60", "#3498db", "#f39c12", "#e74c3c"]
    for patch, color in zip(bp["boxes"], colors_t):
        patch.set_facecolor(color)
        patch.set_alpha(0.5)
    ax.set_xticklabels([f"Tier {t}" for t in range(1, 5)])
    ax.set_ylabel("Word count")
    ax.set_title("Paragraph Length by Tier", fontweight="bold")

    # Category distribution by tier
    ax = axes[0][1]
    for t in range(1, 5):
        cats = Counter()
        for pid in tiers[t]:
            # Top-voted category across all 13 signals.
            top_cat = consensus[pid]["all_cat_counts"].most_common(1)[0][0]
            cats[top_cat] += 1
        pcts = [cats.get(c, 0) / len(tiers[t]) * 100 if tiers[t] else 0 for c in CATEGORIES]
        ax.plot(range(len(CATEGORIES)), pcts, marker="o", label=f"Tier {t}", color=colors_t[t - 1])
    ax.set_xticks(range(len(CAT_SHORT)))
    ax.set_xticklabels(CAT_SHORT)
    ax.set_ylabel("% of tier")
    ax.set_title("Category Profile by Tier", fontweight="bold")
    ax.legend()

    # Specificity distribution by tier
    ax = axes[1][0]
    for t in range(1, 5):
        specs = Counter()
        for pid in tiers[t]:
            top_spec = Counter(consensus[pid]["all_specs"]).most_common(1)[0][0]
            specs[top_spec] += 1
        pcts = [specs.get(s, 0) / len(tiers[t]) * 100 if tiers[t] else 0 for s in SPEC_LEVELS]
        ax.plot(SPEC_LEVELS, pcts, marker="s", label=f"Tier {t}", color=colors_t[t - 1])
    ax.set_xticks(SPEC_LEVELS)
    ax.set_xticklabels(["S1", "S2", "S3", "S4"])
    ax.set_ylabel("% of tier")
    ax.set_title("Specificity Profile by Tier", fontweight="bold")
    ax.legend()

    # For Tier 4, what are the top confusion axes?
    ax = axes[1][1]
    t4_axes = Counter()
    for pid in tiers[4]:
        cats = consensus[pid]["all_cats"]
        unique = set(cats)
        if len(unique) >= 2:
            # Count each unordered pair of distinct labels once per paragraph.
            for a, b in combinations(unique, 2):
                t4_axes[tuple(sorted([a, b]))] += 1
    top = t4_axes.most_common(8)
    if top:
        labels = [f"{CAT_MAP[a]}↔{CAT_MAP[b]}" for (a, b), _ in top]
        counts = [c for _, c in top]
        ax.barh(range(len(labels)), counts, color="#e74c3c")
        ax.set_yticks(range(len(labels)))
        ax.set_yticklabels(labels)
        ax.set_xlabel("Count")
        ax.set_title(f"Tier 4 Confusion Axes (n={len(tiers[4])})", fontweight="bold")
        ax.invert_yaxis()

    fig.tight_layout()
    fig.savefig(CHART_DIR / "24_hard_paragraphs.png", dpi=150)
    plt.close(fig)
    print(" 24_hard_paragraphs.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 25: Model agreement with human majority (per category)
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_model_vs_human_per_category():
    """Chart 25: heatmap of each GenAI model's per-category recall, scored
    against the human majority label of each paragraph."""
    fig, ax = plt.subplots(figsize=(16, 8))

    model_names = sorted(ALL_GENAI)
    recall = np.zeros((len(model_names), len(CATEGORIES)))
    for row, name in enumerate(model_names):
        for col, category in enumerate(CATEGORIES):
            hits = 0
            seen = 0
            # Restrict to paragraphs whose human majority label is this category.
            for rec in consensus.values():
                if rec["human_cat_maj"] != category:
                    continue
                signals = rec["signals"]
                if name not in signals:
                    continue
                seen += 1
                hits += signals[name]["cat"] == category
            recall[row, col] = 100.0 * hits / seen if seen > 0 else 0

    im = ax.imshow(recall, cmap="RdYlGn", aspect="auto", vmin=40, vmax=100)
    ax.set_xticks(range(len(CAT_SHORT)))
    ax.set_xticklabels(CAT_SHORT, fontsize=10)
    ax.set_yticks(range(len(model_names)))
    ax.set_yticklabels(model_names, fontsize=8)
    ax.set_title("Per-Category Recall vs Human Majority (%)", fontweight="bold")
    ax.set_xlabel("Human majority label")

    # Annotate every cell; flip to white text on the dark (low-recall) end.
    for row in range(len(model_names)):
        for col in range(len(CATEGORIES)):
            v = recall[row][col]
            ax.text(col, row, f"{v:.0f}", ha="center", va="center", fontsize=8,
                    color="white" if v < 60 else "black")

    fig.colorbar(im, ax=ax, shrink=0.6, label="Recall %")
    fig.tight_layout()
    fig.savefig(CHART_DIR / "25_model_vs_human_per_category.png", dpi=150)
    plt.close(fig)
    print(" 25_model_vs_human_per_category.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 26: Prompt version effect (v2.5 Stage1 vs v3.0 bench)
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_prompt_version_effect():
    """Chart 26: did the v3.0 prompt (benchmark models) improve over v2.5
    (Stage 1 panel)?  Left panel: per-model category agreement with Opus.
    Right panel: confusion rate on the codebook-ruling category pairs."""
    fig, (ax_acc, ax_conf) = plt.subplots(1, 2, figsize=(14, 6))

    # Stage 1 panel ran prompt v2.5; the benchmark models ran v3.0.
    v25_models = ["Gemini Lite", "Grok Fast", "MIMO Flash"]
    v30_models = [m for m in bench_by_model.keys() if m != "Opus 4.6"]

    # Per-model category agreement with the Opus golden labels.
    model_acc = {}
    for name in v25_models + list(v30_models):
        hits = 0
        seen = 0
        for rec in consensus.values():
            signals = rec["signals"]
            if name not in signals or "Opus 4.6" not in signals or name == "Opus 4.6":
                continue
            seen += 1
            hits += signals[name]["cat"] == signals["Opus 4.6"]["cat"]
        model_acc[name] = 100.0 * hits / seen if seen > 0 else 0

    v25_accs = [model_acc[m] for m in v25_models]
    v30_accs = [model_acc[m] for m in v30_models]

    ax_acc.boxplot([v25_accs, v30_accs], labels=["v2.5 (Stage 1)", "v3.0 (Bench)"],
                   patch_artist=True,
                   boxprops=dict(alpha=0.5))
    # Overlay the individual model points with their names.
    for pos, (accs, group) in enumerate(
            [(v25_accs, v25_models), (v30_accs, list(v30_models))], start=1):
        ax_acc.scatter([pos] * len(accs), accs, s=80, zorder=3, edgecolors="black")
        for acc, name in zip(accs, group):
            ax_acc.annotate(name, (pos, acc), textcoords="offset points",
                            xytext=(8, 0), fontsize=7)
    ax_acc.set_ylabel("Category Agreement with Opus (%)")
    ax_acc.set_title("Prompt Version Effect on Category Accuracy", fontweight="bold")

    # Confusion along the three category pairs the codebook rulings target:
    # MR↔RMP, N/O↔SI, BG↔MR.
    ruling_axes = [
        ("MR↔RMP", "Management Role", "Risk Management Process"),
        ("N/O↔SI", "None/Other", "Strategy Integration"),
        ("BG↔MR", "Board Governance", "Management Role"),
    ]

    x = np.arange(len(ruling_axes))
    width = 0.3

    version_groups = [
        (v25_models, "v2.5", "#e74c3c"),
        (list(v30_models), "v3.0", "#3498db"),
    ]
    for gi, (group_models, label, color) in enumerate(version_groups):
        rates = []
        for _short, cat_a, cat_b in ruling_axes:
            wrong = 0
            seen = 0
            for rec in consensus.values():
                opus_cat = rec.get("opus_cat")
                if opus_cat not in (cat_a, cat_b):
                    continue
                signals = rec["signals"]
                for name in group_models:
                    if name not in signals:
                        continue
                    seen += 1
                    pred = signals[name]["cat"]
                    # "Confused" = predicted the other side of this axis.
                    if pred in (cat_a, cat_b) and pred != opus_cat:
                        wrong += 1
            rates.append(100.0 * wrong / seen if seen > 0 else 0)
        ax_conf.bar(x + (gi - 0.5) * width, rates, width, label=label, color=color)

    ax_conf.set_xticks(x)
    ax_conf.set_xticklabels([short for short, _, _ in ruling_axes])
    ax_conf.set_ylabel("Confusion rate (%)")
    ax_conf.set_title("Codebook Ruling Axes: v2.5 vs v3.0 Confusion", fontweight="bold")
    ax_conf.legend()

    fig.tight_layout()
    fig.savefig(CHART_DIR / "26_prompt_version_effect.png", dpi=150)
    plt.close(fig)
    print(" 26_prompt_version_effect.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 27: Human-GenAI agreement conditioned on difficulty
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_conditional_agreement():
    """Chart 27: agreement conditioned on unanimity.  Left: when all humans
    agree on a category, how often does the GenAI majority match?  Right:
    the mirror question when all GenAI signals agree."""
    fig, (ax_h, ax_g) = plt.subplots(1, 2, figsize=(14, 6))

    # Per category: [agreements, opportunities] for each direction.
    human_unan = {cat: [0, 0] for cat in CATEGORIES}
    genai_unan = {cat: [0, 0] for cat in CATEGORIES}

    for rec in consensus.values():
        h_maj = rec["human_cat_maj"]
        g_maj = rec["genai_cat_maj"]

        if rec["human_cat_unanimous"] and h_maj:
            human_unan[h_maj][1] += 1
            human_unan[h_maj][0] += g_maj == h_maj

        if g_maj and len(set(rec["genai_cats"])) == 1:
            genai_unan[g_maj][1] += 1
            genai_unan[g_maj][0] += h_maj == g_maj

    def _pct(pair):
        # [agree, total] -> percentage, 0 when there were no opportunities.
        agree, total = pair
        return agree / total * 100 if total > 0 else 0

    h_rates = [_pct(human_unan[cat]) for cat in CATEGORIES]
    g_rates = [_pct(genai_unan[cat]) for cat in CATEGORIES]

    x = np.arange(len(CATEGORIES))
    panels = [
        (ax_h, h_rates, human_unan, "#3498db", "GenAI majority agrees (%)",
         "When Humans are Unanimous → GenAI agreement"),
        (ax_g, g_rates, genai_unan, "#e74c3c", "Human majority agrees (%)",
         "When GenAI is Unanimous → Human agreement"),
    ]
    for ax, rates, table, color, ylab, title in panels:
        ax.bar(x, rates, color=color)
        ax.set_xticks(x)
        ax.set_xticklabels(CAT_SHORT)
        ax.set_ylabel(ylab)
        ax.set_title(title, fontweight="bold")
        ax.set_ylim(0, 105)
        for i, (rate, cat) in enumerate(zip(rates, CATEGORIES)):
            n = table[cat][1]
            if n > 0:
                ax.text(i, rate + 1, f"{rate:.0f}%\nn={n}", ha="center", fontsize=8)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "27_conditional_agreement.png", dpi=150)
    plt.close(fig)
    print(" 27_conditional_agreement.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 28: Model clustering — which models agree with which?
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_model_clustering():
    """Chart 28: pairwise category-agreement heatmap over all GenAI signals,
    greedily reordered so that similar models end up adjacent."""
    fig, ax = plt.subplots(figsize=(12, 8))

    names = sorted(ALL_GENAI)
    k = len(names)

    # Pairwise raw agreement % (simpler than kappa, more intuitive).
    # Matrix is symmetric by construction; diagonal pinned at 100.
    pct = np.full((k, k), 100.0)
    for i, j in combinations(range(k), 2):
        m1, m2 = names[i], names[j]
        same = 0
        both = 0
        for rec in consensus.values():
            signals = rec["signals"]
            if m1 in signals and m2 in signals:
                both += 1
                same += signals[m1]["cat"] == signals[m2]["cat"]
        val = 100.0 * same / both if both > 0 else 0
        pct[i, j] = val
        pct[j, i] = val

    # Greedy nearest-neighbour chain ordering on distance = 100 - agreement.
    dist = 100 - pct
    pool = list(range(k))
    order = [pool.pop(0)]
    while pool:
        nxt = min(pool, key=lambda idx: dist[order[-1]][idx])
        pool.remove(nxt)
        order.append(nxt)

    grid = pct[np.ix_(order, order)]
    tick_labels = [names[i] for i in order]

    im = ax.imshow(grid, cmap="YlGnBu", vmin=60, vmax=100, aspect="equal")
    ax.set_xticks(range(k))
    ax.set_xticklabels(tick_labels, rotation=60, ha="right", fontsize=8)
    ax.set_yticks(range(k))
    ax.set_yticklabels(tick_labels, fontsize=8)
    ax.set_title("Model Pairwise Category Agreement % (clustered)", fontweight="bold")

    # Annotate off-diagonal cells; white text on the dark (low) end.
    for r in range(k):
        for col in range(k):
            if r == col:
                continue
            val = grid[r][col]
            ax.text(col, r, f"{val:.0f}", ha="center", va="center", fontsize=7,
                    color="white" if val < 75 else "black")

    fig.colorbar(im, ax=ax, shrink=0.7, label="Agreement %")
    fig.tight_layout()
    fig.savefig(CHART_DIR / "28_model_clustering.png", dpi=150)
    plt.close(fig)
    print(" 28_model_clustering.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 29: Specificity calibration — per-model spec distribution conditioned on Opus spec
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_spec_calibration():
    """Chart 29: row-normalized 4x4 specificity confusion matrices (Opus
    label vs model prediction) for four representative models."""
    fig, panels = plt.subplots(2, 2, figsize=(14, 10))

    wanted = ["GPT-5.4", "Gemini Pro", "Kimi K2.5", "MIMO Flash"]
    wanted = [m for m in wanted if m in ALL_GENAI]

    for ax, name in zip(panels.flat, wanted):
        # counts[i][j]: Opus said spec i+1, this model predicted spec j+1.
        counts = np.zeros((4, 4))
        for rec in consensus.values():
            signals = rec["signals"]
            if "Opus 4.6" not in signals or name not in signals:
                continue
            counts[signals["Opus 4.6"]["spec"] - 1][signals[name]["spec"] - 1] += 1

        # Percent within each Opus row, guarding all-zero rows.
        totals = counts.sum(axis=1, keepdims=True)
        share = np.where(totals > 0, counts / totals * 100, 0)

        ax.imshow(share, cmap="YlGnBu", aspect="equal", vmin=0, vmax=100)
        ax.set_xticks(range(4))
        ax.set_xticklabels(["S1", "S2", "S3", "S4"])
        ax.set_yticks(range(4))
        ax.set_yticklabels(["S1", "S2", "S3", "S4"])
        ax.set_xlabel(f"{name} prediction")
        ax.set_ylabel("Opus label")
        ax.set_title(f"{name} Specificity Calibration", fontweight="bold")

        # Each cell shows the row-percentage plus the raw count.
        for r in range(4):
            for col in range(4):
                v = share[r][col]
                ax.text(col, r, f"{v:.0f}%\n({int(counts[r][col])})",
                        ha="center", va="center", fontsize=8,
                        color="white" if v > 60 else "black")

    fig.tight_layout()
    fig.savefig(CHART_DIR / "29_spec_calibration.png", dpi=150)
    plt.close(fig)
    print(" 29_spec_calibration.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CHART 30: Latency vs accuracy
|
||
# ═══════════════════════════════════════════════════════════
|
||
def plot_latency_vs_accuracy():
    """Chart 30: scatter each model's mean per-paragraph latency against its
    category agreement with Opus, colored by price tier."""
    fig, ax = plt.subplots(figsize=(12, 7))

    # Pool latencies from all three sources, keyed by short model name.
    latencies: dict[str, list[float]] = defaultdict(list)
    for bf in bench_files:
        if "errors" in bf.name:
            continue
        records = load_jsonl(bf)
        if len(records) < 100:  # skip partial/aborted benchmark runs
            continue
        mid = records[0]["provenance"]["modelId"]
        short = MODEL_SHORT.get(mid, mid.split("/")[-1])
        latencies[short].extend(r["provenance"].get("latencyMs", 0) for r in records)
    for annots in stage1_by_pid.values():
        for a in annots:
            mid = a["provenance"]["modelId"]
            short = MODEL_SHORT.get(mid, mid.split("/")[-1])
            latencies[short].append(a["provenance"].get("latencyMs", 0))
    for rec in opus_by_pid.values():
        latencies["Opus 4.6"].append(rec["provenance"].get("latencyMs", 0))

    # Invariant reverse lookup (short name -> full model id) hoisted out of the loop.
    short_to_full = {v: k for k, v in MODEL_SHORT.items()}
    for name in sorted(ALL_GENAI):
        lats = latencies.get(name, [])
        if not lats:
            continue
        mean_sec = sum(lats) / len(lats) / 1000

        hits = 0
        seen = 0
        for rec in consensus.values():
            signals = rec["signals"]
            if name in signals and "Opus 4.6" in signals and name != "Opus 4.6":
                seen += 1
                hits += signals[name]["cat"] == signals["Opus 4.6"]["cat"]
        accuracy = 100.0 * hits / seen if seen > 0 else 0

        tier = MODEL_TIER.get(short_to_full.get(name, ""), "mid")
        ax.scatter(mean_sec, accuracy, s=150, c=TIER_COLORS.get(tier, "#999"),
                   edgecolors="black", linewidths=0.5, zorder=3)
        ax.annotate(name, (mean_sec, accuracy), textcoords="offset points",
                    xytext=(8, 4), fontsize=7)

    ax.set_xlabel("Average Latency (seconds)")
    ax.set_ylabel("Category Agreement with Opus (%)")
    ax.set_title("Latency vs Category Accuracy", fontweight="bold")
    ax.set_ylim(60, 100)

    from matplotlib.patches import Patch
    tier_handles = [Patch(facecolor=c, label=t) for t, c in TIER_COLORS.items()]
    ax.legend(handles=tier_handles, loc="lower left")

    fig.tight_layout()
    fig.savefig(CHART_DIR / "30_latency_vs_accuracy.png", dpi=150)
    plt.close(fig)
    print(" 30_latency_vs_accuracy.png")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# TEXTUAL ANALYSIS
|
||
# ═══════════════════════════════════════════════════════════
|
||
def print_full_analysis():
    """Print the full textual report to stdout.

    Sections: signal coverage, adjudication tiers, cross-source agreement
    (category and specificity), per-model accuracy vs Opus and vs the human
    majority, disagreement axes, three-way splits, unanimity rates, cost
    summary, and the headline findings.

    Reads only module-level state (consensus, tiers, ALL_GENAI, bench_files,
    annotator_names, ...); prints and returns nothing.
    NOTE(review): several percentages below divide by 1200 (or use n/12 as a
    percent shortcut) — assumes exactly 1200 paragraphs; confirm against
    len(consensus).
    """
    print("\n" + "=" * 80)
    print("COMPREHENSIVE 13-SIGNAL ANALYSIS")
    print("=" * 80)

    # ── Summary stats ──
    print(f"\n{'─' * 60}")
    print("SIGNAL COVERAGE")
    print(f"{'─' * 60}")
    signal_counts = [c["n_signals"] for c in consensus.values()]
    print(f" Paragraphs: {len(consensus)}")
    print(f" Min/Max/Mean signals per paragraph: {min(signal_counts)}/{max(signal_counts)}/{np.mean(signal_counts):.1f}")
    print(f" GenAI models: {len(ALL_GENAI)}")
    print(f" Human annotators: {len(annotator_names)}")

    # ── Adjudication ──
    print(f"\n{'─' * 60}")
    print("ADJUDICATION TIERS")
    print(f"{'─' * 60}")
    for t in range(1, 5):
        pct = len(tiers[t]) / 1200 * 100
        print(f" Tier {t}: {len(tiers[t]):4d} ({pct:5.1f}%)")

    # What are the dominant categories in Tier 1 vs Tier 4?
    for t in [1, 4]:
        cats = Counter()
        for pid in tiers[t]:
            # Most common category across all 13 signals for this paragraph.
            cats[consensus[pid]["all_cat_counts"].most_common(1)[0][0]] += 1
        print(f"\n Tier {t} category breakdown:")
        for cat, n in cats.most_common():
            print(f" {CAT_MAP[cat]}: {n} ({n/len(tiers[t])*100:.1f}%)")

    # ── Cross-source agreement ──
    print(f"\n{'─' * 60}")
    print("CROSS-SOURCE AGREEMENT — CATEGORY")
    print(f"{'─' * 60}")

    # Each pair counts only paragraphs where BOTH sides have a label.
    h_eq_s1 = sum(1 for c in consensus.values()
                  if c["human_cat_maj"] and c["s1_cat_maj"] and c["human_cat_maj"] == c["s1_cat_maj"])
    h_eq_op = sum(1 for c in consensus.values()
                  if c["human_cat_maj"] and c["opus_cat"] and c["human_cat_maj"] == c["opus_cat"])
    h_eq_g = sum(1 for c in consensus.values()
                 if c["human_cat_maj"] and c["genai_cat_maj"] and c["human_cat_maj"] == c["genai_cat_maj"])
    s1_eq_op = sum(1 for c in consensus.values()
                   if c["s1_cat_maj"] and c["opus_cat"] and c["s1_cat_maj"] == c["opus_cat"])
    g_eq_op = sum(1 for c in consensus.values()
                  if c["genai_cat_maj"] and c["opus_cat"] and c["genai_cat_maj"] == c["opus_cat"])

    n_hmaj = sum(1 for c in consensus.values() if c["human_cat_maj"])
    n_opus = sum(1 for c in consensus.values() if c["opus_cat"])

    print(f" Human maj = S1 maj: {h_eq_s1}/{n_hmaj} ({h_eq_s1/n_hmaj*100:.1f}%)")
    print(f" Human maj = Opus: {h_eq_op}/{n_opus} ({h_eq_op/n_opus*100:.1f}%)")
    print(f" Human maj = GenAI maj: {h_eq_g}/{n_hmaj} ({h_eq_g/n_hmaj*100:.1f}%)")
    print(f" S1 maj = Opus: {s1_eq_op}/{n_opus} ({s1_eq_op/n_opus*100:.1f}%)")
    print(f" GenAI maj = Opus: {g_eq_op}/{n_opus} ({g_eq_op/n_opus*100:.1f}%)")

    # ── Cross-source agreement: specificity ──
    print(f"\n{'─' * 60}")
    print("CROSS-SOURCE AGREEMENT — SPECIFICITY")
    print(f"{'─' * 60}")

    # Spec majorities can legitimately be 0, so compare against None rather
    # than relying on truthiness.
    h_eq_s1_s = sum(1 for c in consensus.values()
                    if c["human_spec_maj"] is not None and c["s1_spec_maj"] is not None
                    and c["human_spec_maj"] == c["s1_spec_maj"])
    h_eq_op_s = sum(1 for c in consensus.values()
                    if c["human_spec_maj"] is not None and c["opus_spec"] is not None
                    and c["human_spec_maj"] == c["opus_spec"])
    h_eq_g_s = sum(1 for c in consensus.values()
                   if c["human_spec_maj"] is not None and c["genai_spec_maj"] is not None
                   and c["human_spec_maj"] == c["genai_spec_maj"])

    # NOTE(review): n_hs counts paragraphs with a human spec majority only;
    # the Opus and GenAI rows also require the other side's spec, so their
    # true denominators may be slightly smaller than n_hs.
    n_hs = sum(1 for c in consensus.values() if c["human_spec_maj"] is not None)
    print(f" Human maj = S1 maj: {h_eq_s1_s}/{n_hs} ({h_eq_s1_s/n_hs*100:.1f}%)")
    print(f" Human maj = Opus: {h_eq_op_s}/{n_hs} ({h_eq_op_s/n_hs*100:.1f}%)")
    print(f" Human maj = GenAI maj: {h_eq_g_s}/{n_hs} ({h_eq_g_s/n_hs*100:.1f}%)")

    # ── Per-model accuracy ──
    print(f"\n{'─' * 60}")
    print("PER-MODEL ACCURACY vs OPUS (category / specificity / both)")
    print(f"{'─' * 60}")
    model_stats = []
    for model in sorted(ALL_GENAI):
        if model == "Opus 4.6":
            continue  # Opus is the reference; comparing it to itself is vacuous
        agree_c, agree_s, agree_b, total = 0, 0, 0, 0
        for pid, c in consensus.items():
            sig = c["signals"]
            if model in sig and "Opus 4.6" in sig:
                total += 1
                cat_match = sig[model]["cat"] == sig["Opus 4.6"]["cat"]
                spec_match = sig[model]["spec"] == sig["Opus 4.6"]["spec"]
                if cat_match:
                    agree_c += 1
                if spec_match:
                    agree_s += 1
                if cat_match and spec_match:
                    agree_b += 1
        if total > 0:
            model_stats.append((model, agree_c / total * 100, agree_s / total * 100, agree_b / total * 100, total))
    model_stats.sort(key=lambda x: -x[3])  # sort by both
    for model, cat, spec, both, n in model_stats:
        print(f" {model:20s} cat={cat:5.1f}% spec={spec:5.1f}% both={both:5.1f}% (n={n})")

    # ── Per-model accuracy vs HUMAN majority ──
    print(f"\n{'─' * 60}")
    print("PER-MODEL ACCURACY vs HUMAN MAJORITY (category / specificity / both)")
    print(f"{'─' * 60}")
    model_stats_h = []
    for model in sorted(ALL_GENAI):
        agree_c, agree_s, agree_b, total = 0, 0, 0, 0
        for pid, c in consensus.items():
            sig = c["signals"]
            hm_c = c["human_cat_maj"]
            hm_s = c["human_spec_maj"]
            # Paragraph only counts when the humans produced a category majority.
            if model in sig and hm_c:
                total += 1
                cat_match = sig[model]["cat"] == hm_c
                spec_match = hm_s is not None and sig[model]["spec"] == hm_s
                if cat_match:
                    agree_c += 1
                if spec_match:
                    agree_s += 1
                if cat_match and spec_match:
                    agree_b += 1
        if total > 0:
            model_stats_h.append((model, agree_c / total * 100, agree_s / total * 100, agree_b / total * 100, total))
    model_stats_h.sort(key=lambda x: -x[3])
    for model, cat, spec, both, n in model_stats_h:
        print(f" {model:20s} cat={cat:5.1f}% spec={spec:5.1f}% both={both:5.1f}% (n={n})")

    # ── Disagreement patterns ──
    print(f"\n{'─' * 60}")
    print("CROSS-SOURCE DISAGREEMENT AXES (Human Maj ≠ GenAI Maj)")
    print(f"{'─' * 60}")
    h_g_confusion = Counter()
    for c in consensus.values():
        hm = c["human_cat_maj"]
        gm = c["genai_cat_maj"]
        if hm and gm and hm != gm:
            # Sorted pair so A↔B and B↔A collapse into one axis.
            h_g_confusion[tuple(sorted([hm, gm]))] += 1
    for (a, b), count in h_g_confusion.most_common(10):
        print(f" {CAT_MAP[a]}↔{CAT_MAP[b]}: {count}")

    # ── 3-way splits ──
    print(f"\n{'─' * 60}")
    print("THREE-WAY SPLITS (no majority)")
    print(f"{'─' * 60}")
    no_human_maj = sum(1 for c in consensus.values() if c["human_cat_maj"] is None)
    no_s1_maj = sum(1 for c in consensus.values() if c["s1_cat_maj"] is None)
    no_genai_maj = sum(1 for c in consensus.values() if c["genai_cat_maj"] is None)
    print(f" Human 3-way split: {no_human_maj}")
    print(f" Stage 1 3-way split: {no_s1_maj}")
    print(f" GenAI (10-model) no majority: {no_genai_maj}")

    # ── Unanimity rates ──
    print(f"\n{'─' * 60}")
    print("UNANIMITY RATES")
    print(f"{'─' * 60}")
    h_cat_u = sum(1 for c in consensus.values() if c["human_cat_unanimous"])
    h_spec_u = sum(1 for c in consensus.values() if c["human_spec_unanimous"])
    h_both_u = sum(1 for c in consensus.values() if c["human_cat_unanimous"] and c["human_spec_unanimous"])
    g_cat_u = sum(1 for c in consensus.values() if len(set(c["genai_cats"])) == 1)
    g_spec_u = sum(1 for c in consensus.values() if len(set(c["genai_specs"])) == 1)
    g_both_u = sum(1 for c in consensus.values() if len(set(c["genai_cats"])) == 1 and len(set(c["genai_specs"])) == 1)
    a_cat_u = sum(1 for c in consensus.values() if len(set(c["all_cats"])) == 1)
    a_both_u = sum(1 for c in consensus.values() if len(set(c["all_cats"])) == 1 and len(set(c["all_specs"])) == 1)
    # n/12 == n/1200*100 — the percent shortcut assumes 1200 paragraphs.
    print(f" Human (3): cat={h_cat_u/12:.1f}% spec={h_spec_u/12:.1f}% both={h_both_u/12:.1f}%")
    print(f" GenAI (10): cat={g_cat_u/12:.1f}% spec={g_spec_u/12:.1f}% both={g_both_u/12:.1f}%")
    print(f" All (13): cat={a_cat_u/12:.1f}% both={a_both_u/12:.1f}%")

    # ── Cost summary ──
    print(f"\n{'─' * 60}")
    print("COST SUMMARY (benchmark run)")
    print(f"{'─' * 60}")
    total_cost = 0
    for bf in bench_files:
        if "errors" in bf.name:
            continue  # skip error-log files
        records = load_jsonl(bf)
        if len(records) < 100:
            continue  # skip partial/aborted runs
        mid = records[0]["provenance"]["modelId"]
        short = MODEL_SHORT.get(mid, mid.split("/")[-1])
        cost = sum(r["provenance"].get("costUsd", 0) for r in records)
        total_cost += cost
        print(f" {short:20s}: ${cost:.2f}")
    print(f" {'TOTAL':20s}: ${total_cost:.2f}")

    # ── Key findings ──
    print(f"\n{'=' * 80}")
    print("KEY FINDINGS")
    print(f"{'=' * 80}")
    print(f"""
1. ADJUDICATION: {len(tiers[1])}/{1200} paragraphs ({len(tiers[1])/12:.1f}%) fall into Tier 1 (10+/13 agree),
 requiring zero human intervention. Tier 2 adds {len(tiers[2])} more with cross-validated consensus.
 Only {len(tiers[3]) + len(tiers[4])} ({(len(tiers[3]) + len(tiers[4]))/12:.1f}%) need expert adjudication.

2. OPUS AS REFERENCE: GenAI majority agrees with Opus on {g_eq_op/n_opus*100:.1f}% of categories.
 Human majority agrees with Opus on {h_eq_op/n_opus*100:.1f}%.
 Human majority agrees with GenAI majority on {h_eq_g/n_hmaj*100:.1f}%.

3. SPECIFICITY REMAINS HARD: Human spec unanimity is only {h_spec_u/12:.1f}%, GenAI spec unanimity
 is {g_spec_u/12:.1f}%. The Spec 3↔4 boundary is the dominant axis of disagreement for everyone.

4. AARYAN EFFECT: Excluding the outlier annotator would push category alpha from 0.801 to ~0.87+,
 and specificity alpha from 0.546 to ~0.65+. His paragraphs show a ~+45pp jump
 in both-unanimous rate when he's excluded.

5. SAME CONFUSION AXES: MR↔RMP > BG↔MR > N/O↔SI for humans, Stage 1, AND full GenAI panel.
 The codebook boundaries, not the annotator type, drive disagreement.
""")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# RUN ALL
|
||
# ═══════════════════════════════════════════════════════════
|
||
# Run every chart generator in the original order, then the textual report.
print("\nGenerating charts...")
_CHART_FUNCS = (
    plot_kappa_heatmaps,
    plot_all_source_category_dist,
    plot_all_source_spec_dist,
    plot_human_confusion,
    plot_genai_agreement_matrix,
    plot_cross_source_confusion,
    plot_cross_source_specificity,
    plot_adjudication_tiers,
    plot_model_accuracy_vs_opus,
    plot_cost_vs_accuracy,
    plot_per_category_accuracy,
    plot_ensemble_accuracy,
    plot_agreement_by_wordcount,
    plot_time_vs_agreement,
    plot_outlier_annotator,
    plot_with_without_outlier,
    plot_disagreement_axes,
    plot_none_other_analysis,
    plot_specificity_bias_all,
    plot_quiz_vs_quality,
    plot_human_vs_genai_consensus,
    plot_signal_agreement_dist,
    plot_annotator_vs_references,
    plot_hard_paragraphs,
    plot_model_vs_human_per_category,
    plot_prompt_version_effect,
    plot_conditional_agreement,
    plot_model_clustering,
    plot_spec_calibration,
    plot_latency_vs_accuracy,
)
for _make_chart in _CHART_FUNCS:
    _make_chart()
print_full_analysis()
print(f"\nAll charts saved to {CHART_DIR}/")
|