1225 lines
51 KiB
Python
1225 lines
51 KiB
Python
"""
|
|
Comprehensive analysis of human labeling data cross-referenced with
|
|
Stage 1 GenAI panel and Opus golden labels.
|
|
|
|
Outputs charts to data/gold/charts/ and a summary to stdout.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
from collections import Counter, defaultdict
|
|
from pathlib import Path
|
|
|
|
import matplotlib
|
|
matplotlib.use("Agg")
|
|
import matplotlib.pyplot as plt
|
|
import matplotlib.ticker as mticker
|
|
import numpy as np
|
|
|
|
# ── Paths ──
# All inputs/outputs live under the project data tree; paths are absolute so
# the script behaves the same regardless of the working directory.
GOLD_DIR = Path("/home/joey/Documents/sec-cyBERT/data/gold")
CHART_DIR = GOLD_DIR / "charts"
STAGE1_PATH = Path("/home/joey/Documents/sec-cyBERT/data/annotations/stage1.patched.jsonl")
OPUS_PATH = Path("/home/joey/Documents/sec-cyBERT/data/annotations/golden/opus.jsonl")
HOLDOUT_PATH = GOLD_DIR / "paragraphs-holdout.jsonl"
LABELS_PATH = GOLD_DIR / "human-labels-raw.jsonl"
METRICS_PATH = GOLD_DIR / "metrics.json"
OPUS_ID_MAP_PATH = GOLD_DIR / "opus-to-db-id-map.json"

# Label taxonomy: seven content categories plus the short codes used on chart
# axes and legends (CAT_MAP translates full name -> short code).
CATEGORIES = [
    "Board Governance", "Management Role", "Risk Management Process",
    "Third-Party Risk", "Incident Disclosure", "Strategy Integration", "None/Other",
]
CAT_SHORT = ["BG", "MR", "RMP", "TPR", "ID", "SI", "N/O"]
CAT_MAP = dict(zip(CATEGORIES, CAT_SHORT))
# Specificity scale used throughout (1 = generic ... 4 = quantified).
SPEC_LEVELS = [1, 2, 3, 4]

# Create the chart output directory up front so every plot_* can just save.
CHART_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def load_jsonl(path: Path) -> list[dict]:
    """Read *path* as JSON Lines and return one dict per non-blank line."""
    with open(path) as fh:
        return [json.loads(raw) for raw in (ln.strip() for ln in fh) if raw]
|
|
|
|
|
|
def majority_vote(items: list[str]) -> str | None:
|
|
"""Return majority item if one exists, else None."""
|
|
c = Counter(items)
|
|
top, count = c.most_common(1)[0]
|
|
return top if count > len(items) / 2 else None
|
|
|
|
|
|
def plurality_vote(items: list) -> tuple:
    """Return a ``(winner, count)`` pair for the most frequent item."""
    tally = Counter(items)
    return tally.most_common(1)[0]
|
|
|
|
|
|
# ── Load data ──
# Everything is loaded eagerly at import time; this file is a run-once script.
print("Loading data...")
human_labels = load_jsonl(LABELS_PATH)      # raw per-annotator human labels
paragraphs_all = load_jsonl(HOLDOUT_PATH)   # paragraph metadata (id, wordCount, ...)
opus_labels = load_jsonl(OPUS_PATH)         # single-model Opus golden labels
metrics = json.loads(METRICS_PATH.read_text())  # precomputed kappa matrices etc.

# Build paragraph metadata lookup (only holdout ones)
# NOTE: the holdout set is defined by what humans actually labeled, not by the
# holdout file — paragraphs without any human label are ignored everywhere.
holdout_ids = {l["paragraphId"] for l in human_labels}
para_meta = {}
for p in paragraphs_all:
    if p["id"] in holdout_ids:
        para_meta[p["id"]] = p

# Load Stage 1 annotations for holdout (file is large; filter while streaming)
stage1_annots = []
with open(STAGE1_PATH) as f:
    for line in f:
        d = json.loads(line)
        if d["paragraphId"] in holdout_ids:
            stage1_annots.append(d)
|
|
|
|
# Build lookups
# Opus labels: only use if we have sufficient coverage (>50% of holdout)
# The Opus golden run may have been done on a different sample than what's in the DB.
opus_by_pid: dict[str, dict] = {}
for r in opus_labels:
    if r["paragraphId"] in holdout_ids:
        opus_by_pid[r["paragraphId"]] = r
# Also try ID remapping if direct match is low: the map translates Opus-run
# paragraph IDs to DB IDs. Direct matches take precedence over remapped ones.
if len(opus_by_pid) < 600 and OPUS_ID_MAP_PATH.exists():
    opus_id_map = json.loads(OPUS_ID_MAP_PATH.read_text())
    for r in opus_labels:
        db_pid = opus_id_map.get(r["paragraphId"])
        if db_pid and db_pid in holdout_ids and db_pid not in opus_by_pid:
            opus_by_pid[db_pid] = r

# NOTE(review): 600 is the ">50% of a 1200-paragraph holdout" threshold; the
# 1200 in the print below is likewise hard-coded — confirm if holdout changes.
OPUS_AVAILABLE = len(opus_by_pid) >= 600  # gate all Opus analysis on sufficient coverage
opus_coverage = len(opus_by_pid)
print(f" Opus labels matched to holdout: {opus_coverage}/1200"
      f" {'— SKIPPING Opus analysis (insufficient coverage)' if not OPUS_AVAILABLE else ''}")
|
|
|
|
# Stage 1: 3 annotations per paragraph
stage1_by_pid: dict[str, list[dict]] = defaultdict(list)
for a in stage1_annots:
    stage1_by_pid[a["paragraphId"]].append(a)

# Human labels grouped by paragraph
human_by_pid: dict[str, list[dict]] = defaultdict(list)
for l in human_labels:
    human_by_pid[l["paragraphId"]].append(l)

# Annotator names (sorted for stable chart ordering across runs)
annotator_names = sorted({l["annotatorName"] for l in human_labels})
annotator_ids = sorted({l["annotatorId"] for l in human_labels})
# name -> id mapping; assumes annotator names are unique (last record wins)
name_to_id = {}
for l in human_labels:
    name_to_id[l["annotatorName"]] = l["annotatorId"]

print(f" {len(human_labels)} human labels across {len(holdout_ids)} paragraphs")
print(f" {len(stage1_annots)} Stage 1 annotations")
print(f" {len(opus_labels)} Opus labels")
print(f" Annotators: {', '.join(annotator_names)}")
|
|
|
|
# ── Derive per-paragraph consensus labels ──
# For each holdout paragraph, bundle the raw labels plus majority votes from
# every source (humans, Stage 1 panel, Opus) into one flat record; every chart
# below reads from this dict.
consensus = {}  # pid -> {human_cat, human_spec, human_cat_method, ...}
for pid, lbls in human_by_pid.items():
    cats = [l["contentCategory"] for l in lbls]
    specs = [l["specificityLevel"] for l in lbls]

    cat_maj = majority_vote(cats)
    # Specificity levels are ints; vote on their string form and convert back
    # when storing (majority_vote is typed for strings).
    spec_maj = majority_vote([str(s) for s in specs])

    # Stage 1 (may be missing for a paragraph -> majorities become None)
    s1 = stage1_by_pid.get(pid, [])
    s1_cats = [a["label"]["content_category"] for a in s1]
    s1_specs = [a["label"]["specificity_level"] for a in s1]
    s1_cat_maj = majority_vote(s1_cats) if s1_cats else None
    s1_spec_maj = majority_vote([str(s) for s in s1_specs]) if s1_specs else None

    # Opus: at most one label per paragraph
    op = opus_by_pid.get(pid)
    op_cat = op["label"]["content_category"] if op else None
    op_spec = op["label"]["specificity_level"] if op else None

    consensus[pid] = {
        "human_cats": cats,
        "human_specs": specs,
        "human_cat_maj": cat_maj,            # None when no strict majority
        "human_spec_maj": int(spec_maj) if spec_maj else None,
        "human_cat_unanimous": len(set(cats)) == 1,
        "human_spec_unanimous": len(set(specs)) == 1,
        "s1_cats": s1_cats,
        "s1_specs": s1_specs,
        "s1_cat_maj": s1_cat_maj,
        "s1_spec_maj": int(s1_spec_maj) if s1_spec_maj else None,
        "s1_cat_unanimous": len(set(s1_cats)) == 1 if s1_cats else False,
        "opus_cat": op_cat,
        "opus_spec": op_spec,
        # 0 when the paragraph is missing from the holdout metadata file
        "word_count": para_meta.get(pid, {}).get("wordCount", 0),
    }
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
|
|
# CHART 1: Pairwise Kappa Heatmaps (category + specificity)
|
|
# ═══════════════════════════════════════════════════════════
|
|
def plot_kappa_heatmaps():
    """Chart 1: pairwise Cohen's kappa heatmaps for category and specificity.

    Reads the precomputed kappa matrices from the module-level ``metrics``
    dict and writes ``01_kappa_heatmaps.png`` to CHART_DIR.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5.5))

    for ax, dim_key, title in [
        (ax1, "category", "Category"),
        (ax2, "specificity", "Specificity"),
    ]:
        data = metrics["pairwiseKappa"][dim_key]
        names = data["annotators"]
        matrix = np.array(data["matrix"])

        # Mask diagonal — self-agreement is trivially 1 and only adds noise.
        mask = np.eye(len(names), dtype=bool)
        display = np.where(mask, np.nan, matrix)

        im = ax.imshow(display, cmap="RdYlGn", vmin=0, vmax=1, aspect="equal")
        ax.set_xticks(range(len(names)))
        ax.set_xticklabels(names, rotation=45, ha="right", fontsize=9)
        ax.set_yticks(range(len(names)))
        ax.set_yticklabels(names, fontsize=9)
        ax.set_title(f"Pairwise Cohen's κ — {title}", fontsize=12, fontweight="bold")

        # Annotate off-diagonal cells; white text on the darker low-kappa cells.
        for i in range(len(names)):
            for j in range(len(names)):
                if i != j:
                    color = "white" if matrix[i][j] < 0.4 else "black"
                    ax.text(j, i, f"{matrix[i][j]:.2f}", ha="center", va="center",
                            fontsize=8, color=color)

    # One shared colorbar works because both images use vmin=0, vmax=1.
    fig.colorbar(im, ax=[ax1, ax2], shrink=0.8, label="Cohen's κ")
    fig.tight_layout()
    fig.savefig(CHART_DIR / "01_kappa_heatmaps.png", dpi=150)
    plt.close(fig)
    print(" 01_kappa_heatmaps.png")
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
|
|
# CHART 2: Per-annotator category distribution
|
|
# ═══════════════════════════════════════════════════════════
|
|
def plot_annotator_category_dist():
    """Chart 2: category usage per annotator as grouped percentage bars,
    alongside the Stage 1 majority and (when coverage allows) Opus.

    Writes ``02_category_distribution.png``.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    # Also add Stage 1 majority (and Opus if available)
    sources = list(annotator_names) + ["Stage1 Maj"] + (["Opus"] if OPUS_AVAILABLE else [])

    dist = {s: Counter() for s in sources}
    for l in human_labels:
        dist[l["annotatorName"]][l["contentCategory"]] += 1

    for pid, c in consensus.items():
        if c["s1_cat_maj"]:
            dist["Stage1 Maj"][c["s1_cat_maj"]] += 1
        if OPUS_AVAILABLE and c["opus_cat"]:
            dist["Opus"][c["opus_cat"]] += 1

    x = np.arange(len(sources))
    width = 0.11
    # Symmetric offsets so the 7 category bars are centred on each source tick.
    offsets = np.arange(len(CATEGORIES)) - len(CATEGORIES) / 2 + 0.5

    colors = plt.cm.Set2(np.linspace(0, 1, len(CATEGORIES)))

    for i, (cat, color) in enumerate(zip(CATEGORIES, colors)):
        counts = [dist[s].get(cat, 0) for s in sources]
        totals = [sum(dist[s].values()) for s in sources]
        # Percentages per source, so sources with different label counts compare.
        pcts = [c / t * 100 if t > 0 else 0 for c, t in zip(counts, totals)]
        ax.bar(x + offsets[i] * width, pcts, width, label=CAT_MAP[cat], color=color)

    ax.set_xticks(x)
    ax.set_xticklabels(sources, rotation=45, ha="right")
    ax.set_ylabel("% of labels")
    ax.set_title("Category Distribution by Annotator (incl. Stage1 & Opus)", fontweight="bold")
    ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left", fontsize=8)
    ax.yaxis.set_major_formatter(mticker.PercentFormatter())
    fig.tight_layout()
    fig.savefig(CHART_DIR / "02_category_distribution.png", dpi=150)
    plt.close(fig)
    print(" 02_category_distribution.png")
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
|
|
# CHART 3: Per-annotator specificity distribution
|
|
# ═══════════════════════════════════════════════════════════
|
|
def plot_annotator_spec_dist():
    """Chart 3: specificity-level usage per annotator as grouped percentage
    bars, alongside the Stage 1 majority and (when available) Opus.

    Writes ``03_specificity_distribution.png``.
    """
    fig, ax = plt.subplots(figsize=(12, 5))

    sources = list(annotator_names) + ["Stage1 Maj"] + (["Opus"] if OPUS_AVAILABLE else [])

    dist = {s: Counter() for s in sources}
    for l in human_labels:
        dist[l["annotatorName"]][l["specificityLevel"]] += 1

    for pid, c in consensus.items():
        if c["s1_spec_maj"]:
            dist["Stage1 Maj"][c["s1_spec_maj"]] += 1
        if OPUS_AVAILABLE and c["opus_spec"]:
            dist["Opus"][c["opus_spec"]] += 1

    x = np.arange(len(sources))
    width = 0.18
    colors = ["#e74c3c", "#f39c12", "#2ecc71", "#3498db"]
    spec_labels = ["1 Generic", "2 Sector", "3 Firm-Specific", "4 Quantified"]

    for i, (level, color, label) in enumerate(zip(SPEC_LEVELS, colors, spec_labels)):
        counts = [dist[s].get(level, 0) for s in sources]
        totals = [sum(dist[s].values()) for s in sources]
        pcts = [c / t * 100 if t > 0 else 0 for c, t in zip(counts, totals)]
        # (i - 1.5) centres the four level bars on each source tick.
        ax.bar(x + (i - 1.5) * width, pcts, width, label=label, color=color)

    ax.set_xticks(x)
    ax.set_xticklabels(sources, rotation=45, ha="right")
    ax.set_ylabel("% of labels")
    ax.set_title("Specificity Distribution by Annotator (incl. Stage1 & Opus)", fontweight="bold")
    ax.legend()
    ax.yaxis.set_major_formatter(mticker.PercentFormatter())
    fig.tight_layout()
    fig.savefig(CHART_DIR / "03_specificity_distribution.png", dpi=150)
    plt.close(fig)
    print(" 03_specificity_distribution.png")
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
|
|
# CHART 4: Human confusion matrix (aggregated pairwise)
|
|
# ═══════════════════════════════════════════════════════════
|
|
def plot_human_confusion():
    """Chart 4: aggregated human-vs-human confusion matrices.

    Every unordered pair of labels on the same paragraph contributes one
    symmetric count; matrices are row-normalized to percentages. Writes
    ``04_human_confusion.png``.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # Category confusion
    cat_conf = np.zeros((len(CATEGORIES), len(CATEGORIES)))
    cat_idx = {c: i for i, c in enumerate(CATEGORIES)}

    for pid, lbls in human_by_pid.items():
        cats = [l["contentCategory"] for l in lbls]
        # Each annotator pair counted once, incremented symmetrically.
        for i in range(len(cats)):
            for j in range(i + 1, len(cats)):
                a, b = cat_idx[cats[i]], cat_idx[cats[j]]
                cat_conf[a][b] += 1
                cat_conf[b][a] += 1

    # Normalize rows
    row_sums = cat_conf.sum(axis=1, keepdims=True)
    cat_conf_norm = np.where(row_sums > 0, cat_conf / row_sums * 100, 0)

    im1 = ax1.imshow(cat_conf_norm, cmap="YlOrRd", aspect="equal")
    ax1.set_xticks(range(len(CAT_SHORT)))
    ax1.set_xticklabels(CAT_SHORT, fontsize=9)
    ax1.set_yticks(range(len(CAT_SHORT)))
    ax1.set_yticklabels(CAT_SHORT, fontsize=9)
    ax1.set_title("Human Category Confusion (row-normalized %)", fontweight="bold")
    ax1.set_xlabel("Annotator B")
    ax1.set_ylabel("Annotator A")

    # Annotate only non-trivial cells (>0.5%).
    for i in range(len(CAT_SHORT)):
        for j in range(len(CAT_SHORT)):
            val = cat_conf_norm[i][j]
            if val > 0.5:
                color = "white" if val > 40 else "black"
                ax1.text(j, i, f"{val:.0f}", ha="center", va="center",
                         fontsize=7, color=color)

    # Specificity confusion (levels 1-4 map to indices 0-3)
    spec_conf = np.zeros((4, 4))
    for pid, lbls in human_by_pid.items():
        specs = [l["specificityLevel"] for l in lbls]
        for i in range(len(specs)):
            for j in range(i + 1, len(specs)):
                a, b = specs[i] - 1, specs[j] - 1
                spec_conf[a][b] += 1
                spec_conf[b][a] += 1

    row_sums = spec_conf.sum(axis=1, keepdims=True)
    spec_conf_norm = np.where(row_sums > 0, spec_conf / row_sums * 100, 0)

    im2 = ax2.imshow(spec_conf_norm, cmap="YlOrRd", aspect="equal")
    ax2.set_xticks(range(4))
    ax2.set_xticklabels(["Spec 1", "Spec 2", "Spec 3", "Spec 4"], fontsize=9)
    ax2.set_yticks(range(4))
    ax2.set_yticklabels(["Spec 1", "Spec 2", "Spec 3", "Spec 4"], fontsize=9)
    ax2.set_title("Human Specificity Confusion (row-normalized %)", fontweight="bold")

    for i in range(4):
        for j in range(4):
            val = spec_conf_norm[i][j]
            if val > 0.5:
                color = "white" if val > 40 else "black"
                ax2.text(j, i, f"{val:.0f}", ha="center", va="center",
                         fontsize=9, color=color)

    fig.colorbar(im1, ax=ax1, shrink=0.8)
    fig.colorbar(im2, ax=ax2, shrink=0.8)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "04_human_confusion.png", dpi=150)
    plt.close(fig)
    print(" 04_human_confusion.png")
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
|
|
# CHART 5: Human majority vs Stage 1 majority vs Opus
|
|
# ═══════════════════════════════════════════════════════════
|
|
def plot_cross_source_confusion():
    """Chart 5: category confusion matrices between label sources.

    Always compares Human majority vs Stage 1 majority; when Opus coverage is
    sufficient, also Human-vs-Opus and Stage1-vs-Opus. Rows are normalized to
    percentages. Writes ``05_cross_source_category.png``.
    """
    comparisons = [
        ("Human Maj", "Stage1 Maj", "human_cat_maj", "s1_cat_maj"),
    ]
    if OPUS_AVAILABLE:
        comparisons += [
            ("Human Maj", "Opus", "human_cat_maj", "opus_cat"),
            ("Stage1 Maj", "Opus", "s1_cat_maj", "opus_cat"),
        ]
    ncols = len(comparisons)
    fig, axes = plt.subplots(1, ncols, figsize=(7 * ncols, 5.5))
    if ncols == 1:
        # subplots() returns a bare Axes (not an array) when ncols == 1.
        axes = [axes]

    for ax, (name_a, name_b, key_a, key_b) in zip(axes, comparisons):
        conf = np.zeros((len(CATEGORIES), len(CATEGORIES)))
        cat_idx = {c: i for i, c in enumerate(CATEGORIES)}
        total = 0
        agree = 0

        # Only paragraphs where BOTH sources produced a label are counted.
        for pid, c in consensus.items():
            a_val = c[key_a]
            b_val = c[key_b]
            if a_val and b_val:
                conf[cat_idx[a_val]][cat_idx[b_val]] += 1
                total += 1
                if a_val == b_val:
                    agree += 1

        # Normalize rows
        row_sums = conf.sum(axis=1, keepdims=True)
        conf_norm = np.where(row_sums > 0, conf / row_sums * 100, 0)

        # NOTE(review): `im` is never used — no colorbar is drawn for this chart.
        im = ax.imshow(conf_norm, cmap="YlGnBu", aspect="equal")
        ax.set_xticks(range(len(CAT_SHORT)))
        ax.set_xticklabels(CAT_SHORT, fontsize=8)
        ax.set_yticks(range(len(CAT_SHORT)))
        ax.set_yticklabels(CAT_SHORT, fontsize=8)
        pct = agree / total * 100 if total > 0 else 0
        ax.set_title(f"{name_a} vs {name_b}\n({pct:.1f}% agree, n={total})",
                     fontweight="bold", fontsize=10)
        ax.set_ylabel(name_a)
        ax.set_xlabel(name_b)

        for i in range(len(CAT_SHORT)):
            for j in range(len(CAT_SHORT)):
                val = conf_norm[i][j]
                if val > 0.5:
                    color = "white" if val > 50 else "black"
                    ax.text(j, i, f"{val:.0f}", ha="center", va="center",
                            fontsize=7, color=color)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "05_cross_source_category.png", dpi=150)
    plt.close(fig)
    print(" 05_cross_source_category.png")
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
|
|
# CHART 6: Cross-source specificity confusion
|
|
# ═══════════════════════════════════════════════════════════
|
|
def plot_cross_source_specificity():
    """Chart 6: specificity confusion matrices between label sources.

    Same structure as chart 5 but over the 4 specificity levels; uses
    ``is not None`` checks because level values are ints (0 is falsy).
    Writes ``06_cross_source_specificity.png``.
    """
    comparisons = [
        ("Human Maj", "Stage1 Maj", "human_spec_maj", "s1_spec_maj"),
    ]
    if OPUS_AVAILABLE:
        comparisons += [
            ("Human Maj", "Opus", "human_spec_maj", "opus_spec"),
            ("Stage1 Maj", "Opus", "s1_spec_maj", "opus_spec"),
        ]
    ncols = len(comparisons)
    fig, axes = plt.subplots(1, ncols, figsize=(5.5 * ncols, 4.5))
    if ncols == 1:
        # subplots() returns a bare Axes (not an array) when ncols == 1.
        axes = [axes]

    for ax, (name_a, name_b, key_a, key_b) in zip(axes, comparisons):
        conf = np.zeros((4, 4))
        total = 0
        agree = 0

        for pid, c in consensus.items():
            a_val = c[key_a]
            b_val = c[key_b]
            if a_val is not None and b_val is not None:
                # Levels 1-4 map to matrix indices 0-3.
                conf[a_val - 1][b_val - 1] += 1
                total += 1
                if a_val == b_val:
                    agree += 1

        row_sums = conf.sum(axis=1, keepdims=True)
        conf_norm = np.where(row_sums > 0, conf / row_sums * 100, 0)

        im = ax.imshow(conf_norm, cmap="YlGnBu", aspect="equal")
        ax.set_xticks(range(4))
        ax.set_xticklabels(["S1", "S2", "S3", "S4"], fontsize=9)
        ax.set_yticks(range(4))
        ax.set_yticklabels(["S1", "S2", "S3", "S4"], fontsize=9)
        pct = agree / total * 100 if total > 0 else 0
        ax.set_title(f"{name_a} vs {name_b}\n({pct:.1f}% agree, n={total})",
                     fontweight="bold", fontsize=10)
        ax.set_ylabel(name_a)
        ax.set_xlabel(name_b)

        for i in range(4):
            for j in range(4):
                val = conf_norm[i][j]
                if val > 0.5:
                    color = "white" if val > 50 else "black"
                    ax.text(j, i, f"{val:.0f}", ha="center", va="center",
                            fontsize=9, color=color)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "06_cross_source_specificity.png", dpi=150)
    plt.close(fig)
    print(" 06_cross_source_specificity.png")
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
|
|
# CHART 7: Per-annotator agreement with Stage1 and Opus
|
|
# ═══════════════════════════════════════════════════════════
|
|
def plot_annotator_vs_references():
    """Chart 7: per-annotator agreement rates against reference labelings.

    For each human annotator, computes the % of their labels that match each
    reference source (Stage 1 majority, Opus when available, human majority),
    for both category and specificity, as grouped bars per annotator.
    Writes ``07_annotator_vs_references.png``.

    Fixes vs original:
      * the reference-source list is built once (it does not depend on the
        axis being drawn);
      * bar groups are centred on each annotator tick for any number of
        reference sources — previously ``(ri - 1) * width`` left the group
        off-centre when Opus was unavailable (2 sources).
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Build per-annotator label lookup: name -> {paragraphId -> label record}
    ann_labels: dict[str, dict[str, dict]] = defaultdict(dict)
    for l in human_labels:
        ann_labels[l["annotatorName"]][l["paragraphId"]] = l

    # Reference sources are identical for both panels; build once.
    ref_sources = [
        ("Stage1 Maj", "s1_cat_maj", "s1_spec_maj"),
        ("Human Maj", "human_cat_maj", "human_spec_maj"),
    ]
    if OPUS_AVAILABLE:
        ref_sources.insert(1, ("Opus", "opus_cat", "opus_spec"))

    x = np.arange(len(annotator_names))
    width = 0.25 if len(ref_sources) == 3 else 0.3
    # Offset so the bar group is centred on the tick for any source count.
    center = (len(ref_sources) - 1) / 2

    for ax, dim, title in [(ax1, "cat", "Category"), (ax2, "spec", "Specificity")]:
        for ri, (ref_name, ref_key_cat, ref_key_spec) in enumerate(ref_sources):
            rates = []
            for ann_name in annotator_names:
                agree = 0
                total = 0
                for pid, lbl in ann_labels[ann_name].items():
                    c = consensus.get(pid)
                    if not c:
                        continue
                    if dim == "cat":
                        ref_val = c[ref_key_cat]
                        ann_val = lbl["contentCategory"]
                    else:
                        ref_val = c[ref_key_spec]
                        ann_val = lbl["specificityLevel"]
                    # Only count paragraphs where the reference has a label.
                    if ref_val is not None:
                        total += 1
                        # Compare as strings: spec values may be int vs str.
                        if str(ann_val) == str(ref_val):
                            agree += 1
                rates.append(agree / total * 100 if total > 0 else 0)

            ax.bar(x + (ri - center) * width, rates, width, label=ref_name)

        ax.set_xticks(x)
        ax.set_xticklabels(annotator_names, rotation=45, ha="right")
        ax.set_ylabel("Agreement %")
        ax.set_title(f"Per-Annotator {title} Agreement with References", fontweight="bold")
        ax.legend()
        ax.set_ylim(0, 100)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "07_annotator_vs_references.png", dpi=150)
    plt.close(fig)
    print(" 07_annotator_vs_references.png")
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
|
|
# CHART 8: Agreement rate by word count (binned)
|
|
# ═══════════════════════════════════════════════════════════
|
|
def plot_agreement_by_wordcount():
    """Chart 8: unanimous-agreement rate binned by paragraph length.

    Left panel: category unanimity; right: category AND specificity unanimity.
    Writes ``08_agreement_by_wordcount.png``.

    NOTE(review): the bin edges assume integer word counts (gaps like 50/51
    would drop fractional values), and paragraphs over 500 words fall outside
    every bin — confirm both assumptions hold for the holdout data.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Bin paragraphs by word count (inclusive bounds on both ends)
    wc_bins = [(0, 50), (51, 80), (81, 120), (121, 180), (181, 500)]
    bin_labels = ["≤50", "51-80", "81-120", "121-180", "180+"]

    for ax, dim, title in [(ax1, "cat", "Category"), (ax2, "both", "Both")]:
        rates = []
        ns = []
        for lo, hi in wc_bins:
            agree = 0
            total = 0
            for pid, c in consensus.items():
                wc = c["word_count"]
                if lo <= wc <= hi:
                    total += 1
                    if dim == "cat":
                        if c["human_cat_unanimous"]:
                            agree += 1
                    else:
                        if c["human_cat_unanimous"] and c["human_spec_unanimous"]:
                            agree += 1
            rates.append(agree / total * 100 if total > 0 else 0)
            ns.append(total)

        bars = ax.bar(range(len(bin_labels)), rates, color="#3498db")
        ax.set_xticks(range(len(bin_labels)))
        ax.set_xticklabels(bin_labels)
        ax.set_xlabel("Word Count")
        ax.set_ylabel("Unanimous Agreement %")
        ax.set_title(f"{title} Consensus by Paragraph Length", fontweight="bold")
        ax.set_ylim(0, 80)

        # Sample size annotated above each bar
        for bar, n in zip(bars, ns):
            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1,
                    f"n={n}", ha="center", va="bottom", fontsize=8)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "08_agreement_by_wordcount.png", dpi=150)
    plt.close(fig)
    print(" 08_agreement_by_wordcount.png")
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
|
|
# CHART 9: Active time vs agreement
|
|
# ═══════════════════════════════════════════════════════════
|
|
def plot_time_vs_agreement():
    """Chart 9: labeling-time distributions for agreed vs disagreed paragraphs.

    For each paragraph, takes the median of the annotators' active labeling
    times (ms -> seconds) and overlays density histograms for paragraphs where
    all annotators chose the same category vs those where they disagreed.
    Writes ``09_time_vs_agreement.png``.

    Fix vs original: uses the true median (mean of the middle pair for an
    even-sized sample); previously the upper-middle element was taken, which
    overstates the median whenever an even number of times is present.
    """
    fig, ax = plt.subplots(figsize=(10, 5))

    # For each paragraph, compute median active time and whether humans agreed
    agreed_times = []
    disagreed_times = []

    for pid, lbls in human_by_pid.items():
        times = [l["activeMs"] for l in lbls if l["activeMs"] is not None]
        if not times:
            continue
        ts = sorted(times)
        mid = len(ts) // 2
        # True median: middle element for odd n, mean of middle pair for even n.
        med_ms = ts[mid] if len(ts) % 2 else (ts[mid - 1] + ts[mid]) / 2
        med_time = med_ms / 1000  # seconds

        cats = [l["contentCategory"] for l in lbls]
        if len(set(cats)) == 1:
            agreed_times.append(med_time)
        else:
            disagreed_times.append(med_time)

    bins = np.linspace(0, 120, 30)
    ax.hist(agreed_times, bins=bins, alpha=0.6, label=f"Category agreed (n={len(agreed_times)})",
            color="#2ecc71", density=True)
    ax.hist(disagreed_times, bins=bins, alpha=0.6, label=f"Category disagreed (n={len(disagreed_times)})",
            color="#e74c3c", density=True)
    ax.set_xlabel("Median Active Time per Paragraph (seconds)")
    ax.set_ylabel("Density")
    ax.set_title("Labeling Time: Agreed vs Disagreed Paragraphs", fontweight="bold")
    ax.legend()
    ax.set_xlim(0, 120)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "09_time_vs_agreement.png", dpi=150)
    plt.close(fig)
    print(" 09_time_vs_agreement.png")
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
|
|
# CHART 10: None/Other deep dive — what do people label instead?
|
|
# ═══════════════════════════════════════════════════════════
|
|
def plot_none_other_analysis():
    """Chart 10: deep dive on the None/Other category.

    Left panel: for paragraphs where at least one (but not every) annotator
    chose None/Other, which categories did the dissenting annotators pick?
    Right panel: for paragraphs with no unanimous human category, what did the
    Stage 1 majority say? Writes ``10_none_other_analysis.png``.

    Fix vs original: removed the ``s1_noneother_human_not`` tally — it was
    computed in a full pass over ``consensus`` but never plotted (dead code).
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # For paragraphs where at least one annotator said None/Other
    # What did the others say?
    noneother_vs = Counter()
    noneother_pids = set()
    for pid, lbls in human_by_pid.items():
        cats = [l["contentCategory"] for l in lbls]
        if "None/Other" in cats and len(set(cats)) > 1:
            noneother_pids.add(pid)
            for c in cats:
                if c != "None/Other":
                    noneother_vs[c] += 1

    cats_sorted = sorted(noneother_vs.keys(), key=lambda c: -noneother_vs[c])
    ax1.barh(range(len(cats_sorted)), [noneother_vs[c] for c in cats_sorted], color="#e74c3c")
    ax1.set_yticks(range(len(cats_sorted)))
    ax1.set_yticklabels([CAT_MAP.get(c, c) for c in cats_sorted])
    ax1.set_xlabel("Count")
    ax1.set_title(f"When someone says N/O but others disagree\n({len(noneother_pids)} paragraphs)",
                  fontweight="bold")
    ax1.invert_yaxis()

    # What does Stage1 say when humans disagree on category?
    s1_for_disagreed = Counter()
    for pid, c in consensus.items():
        if not c["human_cat_unanimous"] and c["s1_cat_maj"]:
            s1_for_disagreed[c["s1_cat_maj"]] += 1

    cats_sorted2 = sorted(s1_for_disagreed.keys(), key=lambda c: -s1_for_disagreed[c])
    ax2.barh(range(len(cats_sorted2)), [s1_for_disagreed[c] for c in cats_sorted2], color="#3498db")
    ax2.set_yticks(range(len(cats_sorted2)))
    ax2.set_yticklabels([CAT_MAP.get(c, c) for c in cats_sorted2])
    ax2.set_xlabel("Count")
    ax2.set_title(f"Stage1 majority for human-disagreed paragraphs\n(n={sum(s1_for_disagreed.values())})",
                  fontweight="bold")
    ax2.invert_yaxis()

    fig.tight_layout()
    fig.savefig(CHART_DIR / "10_none_other_analysis.png", dpi=150)
    plt.close(fig)
    print(" 10_none_other_analysis.png")
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
|
|
# CHART 11: Aaryan vs everyone else — where does he diverge?
|
|
# ═══════════════════════════════════════════════════════════
|
|
def plot_outlier_annotator():
    """Chart 11: divergence analysis for the lowest-agreement annotator.

    Picks the annotator with the lowest mean pairwise category kappa from
    ``metrics``; then, for paragraphs where every other annotator agrees on a
    category but the outlier chose differently, plots (left) what the others
    chose and (right) what the outlier chose instead.
    Writes ``11_outlier_annotator.png``.

    Fixes vs original:
      * removed the unused ``outlier_id`` lookup;
      * "others agree" now requires that *all* other annotators chose the same
        category — previously only the first two were compared, which could
        miscount if more than three annotators labeled a paragraph.
    """
    # Find the annotator with lowest avg kappa
    cat_kappas = metrics["pairwiseKappa"]["category"]["pairs"]
    ann_kappa_sum = defaultdict(lambda: {"sum": 0, "n": 0})
    for pair in cat_kappas:
        ann_kappa_sum[pair["a1"]]["sum"] += pair["kappa"]
        ann_kappa_sum[pair["a1"]]["n"] += 1
        ann_kappa_sum[pair["a2"]]["sum"] += pair["kappa"]
        ann_kappa_sum[pair["a2"]]["n"] += 1

    outlier = min(ann_kappa_sum, key=lambda a: ann_kappa_sum[a]["sum"] / ann_kappa_sum[a]["n"])

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Compare the outlier's category choices against the other annotators'
    # unanimous choice on the same paragraph.
    outlier_diverge_from = Counter()  # what the others (unanimously) chose
    outlier_diverge_to = Counter()    # what the outlier picked instead

    for pid, lbls in human_by_pid.items():
        outlier_lbl = None
        others = []
        for l in lbls:
            if l["annotatorName"] == outlier:
                outlier_lbl = l
            else:
                others.append(l)

        if outlier_lbl and len(others) >= 2:
            other_cats = [o["contentCategory"] for o in others]
            # All other annotators must agree AND differ from the outlier.
            if len(set(other_cats)) == 1 and other_cats[0] != outlier_lbl["contentCategory"]:
                outlier_diverge_from[other_cats[0]] += 1
                outlier_diverge_to[outlier_lbl["contentCategory"]] += 1

    # Diverge FROM (what category the others agreed on)
    cats1 = sorted(outlier_diverge_from.keys(), key=lambda c: -outlier_diverge_from[c])
    ax1.barh(range(len(cats1)), [outlier_diverge_from[c] for c in cats1], color="#e74c3c")
    ax1.set_yticks(range(len(cats1)))
    ax1.set_yticklabels([CAT_MAP.get(c, c) for c in cats1])
    ax1.set_xlabel("Count")
    ax1.set_title(f"{outlier} disagrees: what others chose\n(others agreed, {outlier} didn't)",
                  fontweight="bold")
    ax1.invert_yaxis()

    # Diverge TO (what did the outlier pick instead)
    cats2 = sorted(outlier_diverge_to.keys(), key=lambda c: -outlier_diverge_to[c])
    ax2.barh(range(len(cats2)), [outlier_diverge_to[c] for c in cats2], color="#f39c12")
    ax2.set_yticks(range(len(cats2)))
    ax2.set_yticklabels([CAT_MAP.get(c, c) for c in cats2])
    ax2.set_xlabel("Count")
    ax2.set_title(f"What {outlier} chose instead", fontweight="bold")
    ax2.invert_yaxis()

    fig.suptitle(f"Outlier Analysis: {outlier} (lowest avg κ = "
                 f"{ann_kappa_sum[outlier]['sum']/ann_kappa_sum[outlier]['n']:.3f})",
                 fontweight="bold", fontsize=12)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "11_outlier_annotator.png", dpi=150)
    plt.close(fig)
    print(" 11_outlier_annotator.png")
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
|
|
# CHART 12: Human vs GenAI consensus comparison
|
|
# ═══════════════════════════════════════════════════════════
|
|
def plot_human_vs_genai_consensus():
    """Chart 12: human vs Stage 1 consensus comparison, three panels.

    Panel 1: unanimity rates (human, Stage 1, both) over the holdout.
    Panel 2: whether the human and Stage 1 category majorities agree.
    Panel 3: the same majority comparison for specificity.
    Writes ``12_human_vs_genai_consensus.png``.

    Fixes vs original:
      * unanimity percentages use the actual holdout size (``len(consensus)``)
        instead of a hard-coded 1200;
      * removed five tally variables (``human_unan_spec``, ``s1_unan_spec``,
        ``both_agree``, ``human_unan_s1_diff``, ``s1_unan_human_diff``) that
        were never used in any panel.
    """
    fig, axes = plt.subplots(1, 3, figsize=(16, 5))

    n_holdout = len(consensus)

    # Panel 1: category unanimity rates
    human_unan_cat = sum(1 for c in consensus.values() if c["human_cat_unanimous"])
    s1_unan_cat = sum(1 for c in consensus.values() if c["s1_cat_unanimous"])
    both_unan_cat = sum(1 for c in consensus.values()
                        if c["human_cat_unanimous"] and c["s1_cat_unanimous"])

    ax = axes[0]
    labels_data = ["Human\nunanimous", "Stage1\nunanimous", "Both\nunanimous"]
    vals = [human_unan_cat, s1_unan_cat, both_unan_cat]
    pcts = [v / n_holdout * 100 for v in vals]
    bars = ax.bar(range(3), pcts, color=["#3498db", "#e74c3c", "#2ecc71"])
    ax.set_xticks(range(3))
    ax.set_xticklabels(labels_data)
    ax.set_ylabel("%")
    ax.set_title("Category Unanimity Rates", fontweight="bold")
    for bar, v, p in zip(bars, vals, pcts):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1,
                f"{p:.1f}%\n({v})", ha="center", fontsize=9)

    # Panel 2: do the category majorities match?
    ax = axes[1]
    both_majority_agree = 0
    majorities_differ = 0
    for pid, c in consensus.items():
        hm = c["human_cat_maj"]
        sm = c["s1_cat_maj"]
        if not hm or not sm:
            continue  # skip paragraphs where either source has no majority
        if hm == sm:
            both_majority_agree += 1
        else:
            majorities_differ += 1

    total = both_majority_agree + majorities_differ
    vals = [both_majority_agree, majorities_differ]
    pcts = [v / total * 100 for v in vals]
    labels_d = ["Majorities\nagree", "Majorities\ndiffer"]
    colors_d = ["#2ecc71", "#e74c3c"]
    bars = ax.bar(range(2), pcts, color=colors_d)
    ax.set_xticks(range(2))
    ax.set_xticklabels(labels_d)
    ax.set_ylabel("%")
    ax.set_title(f"Human vs Stage1 Category Agreement\n(n={total})", fontweight="bold")
    for bar, v, p in zip(bars, vals, pcts):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5,
                f"{v}\n({p:.1f}%)", ha="center", fontsize=9)

    # Panel 3: same comparison for specificity (levels are ints; use `is None`)
    ax = axes[2]
    spec_agree = 0
    spec_differ = 0
    for pid, c in consensus.items():
        hm = c["human_spec_maj"]
        sm = c["s1_spec_maj"]
        if hm is None or sm is None:
            continue
        if hm == sm:
            spec_agree += 1
        else:
            spec_differ += 1

    total = spec_agree + spec_differ
    vals = [spec_agree, spec_differ]
    pcts = [v / total * 100 for v in vals]
    bars = ax.bar(range(2), pcts, color=colors_d)
    ax.set_xticks(range(2))
    ax.set_xticklabels(labels_d)
    ax.set_ylabel("%")
    ax.set_title(f"Human vs Stage1 Specificity Agreement\n(n={total})", fontweight="bold")
    for bar, v, p in zip(bars, vals, pcts):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5,
                f"{v}\n({p:.1f}%)", ha="center", fontsize=9)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "12_human_vs_genai_consensus.png", dpi=150)
    plt.close(fig)
    print(" 12_human_vs_genai_consensus.png")
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
|
|
# CHART 13: Per-annotator specificity bias
|
|
# ═══════════════════════════════════════════════════════════
|
|
def plot_specificity_bias():
    """Chart 13: per-annotator mean specificity offset vs the Stage 1 majority.

    Negative bars mean the annotator tends to rate specificity lower than
    the GenAI panel majority; positive bars mean higher.
    """
    fig, ax = plt.subplots(figsize=(10, 5))

    # Index each annotator's labels by paragraph id for O(1) lookup.
    by_annotator: dict[str, dict[str, dict]] = defaultdict(dict)
    for lbl in human_labels:
        by_annotator[lbl["annotatorName"]][lbl["paragraphId"]] = lbl

    # Mean (annotator specificity - Stage 1 majority specificity) per annotator;
    # paragraphs without a Stage 1 specificity majority are skipped.
    mean_offsets = []
    for name in annotator_names:
        deltas = [
            lbl["specificityLevel"] - consensus[pid]["s1_spec_maj"]
            for pid, lbl in by_annotator[name].items()
            if pid in consensus and consensus[pid]["s1_spec_maj"] is not None
        ]
        mean_offsets.append(np.mean(deltas) if deltas else 0)

    def _bias_color(b):
        # Red = under-rates, green = over-rates, grey = roughly neutral.
        if b < -0.1:
            return "#e74c3c"
        if b > 0.1:
            return "#2ecc71"
        return "#95a5a6"

    bars = ax.bar(range(len(annotator_names)), mean_offsets,
                  color=[_bias_color(b) for b in mean_offsets])
    ax.set_xticks(range(len(annotator_names)))
    ax.set_xticklabels(annotator_names, rotation=45, ha="right")
    ax.set_ylabel("Mean (Human - Stage1 Maj) Specificity")
    ax.set_title("Specificity Bias vs Stage1 (negative = under-rates, positive = over-rates)",
                 fontweight="bold")
    ax.axhline(0, color="black", linewidth=0.5)

    # Annotate each bar with its signed mean, nudged away from the axis line.
    for bar, b in zip(bars, mean_offsets):
        nudge = 0.02 if b >= 0 else -0.05
        ax.text(bar.get_x() + bar.get_width() / 2,
                bar.get_height() + nudge,
                f"{b:+.2f}", ha="center", fontsize=9)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "13_specificity_bias.png", dpi=150)
    plt.close(fig)
    print(" 13_specificity_bias.png")
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
|
|
# CHART 14: Disagreement axes — human vs GenAI top confusions
|
|
# ═══════════════════════════════════════════════════════════
|
|
def _count_disagreement_axes(cat_lists):
    """Tally pairwise category-disagreement axes over vote lists.

    For each list of category votes: a 2-way split counts once for the
    (sorted) pair of categories involved; a 3-way split counts once per
    unordered pair of differing votes.  Unanimous lists and splits with
    more than 3 distinct categories are ignored.
    """
    axes = Counter()
    for cats in cat_lists:
        distinct = set(cats)
        if len(distinct) == 2:
            axes[tuple(sorted(distinct))] += 1
        elif len(distinct) == 3:
            for i, c1 in enumerate(cats):
                for c2 in cats[i + 1:]:
                    if c1 != c2:
                        axes[tuple(sorted([c1, c2]))] += 1
    return axes


def plot_disagreement_axes():
    """Chart 14: top category-disagreement axes, human annotators vs the
    Stage 1 GenAI panel, computed over the same paragraphs.

    Refactored: the tallying logic was duplicated verbatim for the human
    and Stage 1 sources; it now lives in _count_disagreement_axes, and the
    two panels are drawn by a shared inner helper.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Human disagreement axes (e.g. 2 annotators agree, 1 disagrees).
    human_axes = _count_disagreement_axes(
        [l["contentCategory"] for l in lbls] for lbls in human_by_pid.values())
    # Stage 1 disagreement axes on the same paragraphs.
    s1_axes = _count_disagreement_axes(c["s1_cats"] for c in consensus.values())

    def _draw_panel(ax, axis_counts, color, title):
        # Horizontal bar chart of the 10 most common disagreement pairs.
        top = axis_counts.most_common(10)
        labels = [f"{CAT_MAP[a]}↔{CAT_MAP[b]}" for (a, b), _ in top]
        counts = [n for _, n in top]
        ax.barh(range(len(labels)), counts, color=color)
        ax.set_yticks(range(len(labels)))
        ax.set_yticklabels(labels, fontsize=9)
        ax.set_xlabel("Disagreement count")
        ax.set_title(title, fontweight="bold")
        ax.invert_yaxis()

    _draw_panel(ax1, human_axes, "#e74c3c", "Human Top Disagreement Axes")
    _draw_panel(ax2, s1_axes, "#3498db",
                "Stage 1 Top Disagreement Axes (same paragraphs)")

    fig.tight_layout()
    fig.savefig(CHART_DIR / "14_disagreement_axes.png", dpi=150)
    plt.close(fig)
    print(" 14_disagreement_axes.png")
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
|
|
# CHART 15: Quiz performance vs labeling quality
|
|
# ═══════════════════════════════════════════════════════════
|
|
def plot_quiz_vs_quality():
    """Chart 15: quiz attempts per annotator vs labeling quality.

    Quality is measured as the annotator's category-agreement rate with the
    Stage 1 panel majority.  Fixes: the right y-axis was mislabeled
    "Opus agreement %" although the plotted series is Stage 1 agreement
    (matching the title and bar label); the unused best-quiz-score dict
    was removed.
    """
    fig, ax = plt.subplots(figsize=(10, 5))

    # Load quiz data and count attempts per annotator.
    quiz_sessions = load_jsonl(GOLD_DIR / "quiz-sessions.jsonl")
    attempts: dict[str, int] = defaultdict(int)
    for q in quiz_sessions:
        attempts[q["annotatorName"]] += 1

    # Index each annotator's labels by paragraph id.
    ann_labels_by_name: dict[str, dict[str, dict]] = defaultdict(dict)
    for l in human_labels:
        ann_labels_by_name[l["annotatorName"]][l["paragraphId"]] = l

    # Agreement rate with Stage 1 majority per annotator (percent).
    s1_agree = {}
    for name in annotator_names:
        agree = 0
        total = 0
        for pid, lbl in ann_labels_by_name[name].items():
            c = consensus.get(pid)
            if c and c["s1_cat_maj"]:
                total += 1
                if lbl["contentCategory"] == c["s1_cat_maj"]:
                    agree += 1
        s1_agree[name] = agree / total * 100 if total > 0 else 0

    x = np.arange(len(annotator_names))
    width = 0.35
    ax.bar(x - width/2, [attempts.get(n, 0) for n in annotator_names],
           width, label="Quiz attempts", color="#f39c12")
    # Second y-axis: percentages live on a different scale than attempt counts.
    ax2 = ax.twinx()
    ax2.bar(x + width/2, [s1_agree.get(n, 0) for n in annotator_names],
            width, label="Category agree w/ Stage1 (%)", color="#3498db", alpha=0.7)

    ax.set_xticks(x)
    ax.set_xticklabels(annotator_names, rotation=45, ha="right")
    ax.set_ylabel("Quiz attempts", color="#f39c12")
    # BUG FIX: this axis shows Stage 1 agreement, not Opus agreement.
    ax2.set_ylabel("Stage1 agreement %", color="#3498db")
    ax.set_title("Quiz Attempts vs Labeling Quality (Stage1 Agreement)", fontweight="bold")

    # twinx() gives each axis its own legend; merge them into one box.
    lines1, labels1 = ax.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax.legend(lines1 + lines2, labels1 + labels2, loc="upper left")

    fig.tight_layout()
    fig.savefig(CHART_DIR / "15_quiz_vs_quality.png", dpi=150)
    plt.close(fig)
    print(" 15_quiz_vs_quality.png")
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
|
|
# CHART 16: Aaryan-excluded metrics comparison
|
|
# ═══════════════════════════════════════════════════════════
|
|
def plot_with_without_outlier():
    """Chart 16: agreement metrics with vs without the outlier annotator.

    The outlier is the annotator with the lowest mean pairwise category
    kappa.  Left panel: unanimity rates on triple-labeled paragraphs the
    outlier participated in, with and without their vote.  Right panel:
    distribution of pairwise category kappas, all pairs vs pairs that
    exclude the outlier.

    Fixes: guard against ZeroDivisionError when the outlier labeled no
    triple-annotated paragraphs; seed the scatter jitter so repeated runs
    produce identical charts.
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Find outlier (lowest average pairwise category kappa).
    cat_kappas = metrics["pairwiseKappa"]["category"]["pairs"]
    ann_kappa_sum = defaultdict(lambda: {"sum": 0, "n": 0})
    for pair in cat_kappas:
        ann_kappa_sum[pair["a1"]]["sum"] += pair["kappa"]
        ann_kappa_sum[pair["a1"]]["n"] += 1
        ann_kappa_sum[pair["a2"]]["sum"] += pair["kappa"]
        ann_kappa_sum[pair["a2"]]["n"] += 1
    outlier = min(ann_kappa_sum, key=lambda a: ann_kappa_sum[a]["sum"] / ann_kappa_sum[a]["n"])

    # Compute unanimity with and without the outlier, restricted to
    # triple-labeled paragraphs where the outlier participated.
    outlier_participated = 0
    cat_agree_with = 0
    cat_agree_without = 0
    spec_agree_with = 0
    spec_agree_without = 0
    both_agree_with = 0
    both_agree_without = 0

    for pid, lbls in human_by_pid.items():
        if len(lbls) < 3:
            continue
        names = [l["annotatorName"] for l in lbls]
        if outlier not in names:
            continue
        outlier_participated += 1

        cats_all = [l["contentCategory"] for l in lbls]
        specs_all = [l["specificityLevel"] for l in lbls]
        cats_excl = [l["contentCategory"] for l in lbls if l["annotatorName"] != outlier]
        specs_excl = [l["specificityLevel"] for l in lbls if l["annotatorName"] != outlier]

        cat_u_all = len(set(cats_all)) == 1
        cat_u_excl = len(set(cats_excl)) == 1
        spec_u_all = len(set(specs_all)) == 1
        spec_u_excl = len(set(specs_excl)) == 1

        if cat_u_all:
            cat_agree_with += 1
        if cat_u_excl:
            cat_agree_without += 1
        if spec_u_all:
            spec_agree_with += 1
        if spec_u_excl:
            spec_agree_without += 1
        if cat_u_all and spec_u_all:
            both_agree_with += 1
        if cat_u_excl and spec_u_excl:
            both_agree_without += 1

    n = outlier_participated
    # BUG FIX: avoid ZeroDivisionError if the outlier never appears in a
    # triple-labeled paragraph.
    if n == 0:
        plt.close(fig)
        print(f" 16_with_without_outlier.png skipped (no triple-labeled paragraphs for {outlier})")
        return

    metrics_labels = ["Category\nUnanimous", "Specificity\nUnanimous", "Both\nUnanimous"]
    with_vals = [cat_agree_with / n * 100, spec_agree_with / n * 100, both_agree_with / n * 100]
    without_vals = [cat_agree_without / n * 100, spec_agree_without / n * 100, both_agree_without / n * 100]

    ax = axes[0]
    x = np.arange(3)
    width = 0.35
    ax.bar(x - width/2, with_vals, width, label="All 3 annotators", color="#e74c3c")
    ax.bar(x + width/2, without_vals, width, label=f"Excluding {outlier}", color="#2ecc71")
    ax.set_xticks(x)
    ax.set_xticklabels(metrics_labels)
    ax.set_ylabel("% of paragraphs")
    ax.set_title(f"Agreement on {outlier}'s paragraphs (n={n})", fontweight="bold")
    ax.legend()

    # Annotate each metric with the percentage-point delta from exclusion.
    for i, (w, wo) in enumerate(zip(with_vals, without_vals)):
        delta = wo - w
        ax.text(i, max(w, wo) + 2, f"Δ={delta:+.1f}pp", ha="center", fontsize=9, fontweight="bold")

    # Right panel: kappa distributions with/without the outlier's pairs.
    ax = axes[1]
    kappas_with = [p["kappa"] for p in cat_kappas]
    kappas_without = [p["kappa"] for p in cat_kappas if outlier not in (p["a1"], p["a2"])]

    positions = [1, 2]
    bp = ax.boxplot([kappas_with, kappas_without], positions=positions, widths=0.5,
                    patch_artist=True)
    bp["boxes"][0].set_facecolor("#e74c3c")
    bp["boxes"][0].set_alpha(0.5)
    bp["boxes"][1].set_facecolor("#2ecc71")
    bp["boxes"][1].set_alpha(0.5)

    ax.set_xticks(positions)
    ax.set_xticklabels(["All pairs", f"Excl. {outlier}"])
    ax.set_ylabel("Cohen's κ (category)")
    ax.set_title("Kappa Distribution", fontweight="bold")

    # Overlay individual kappa points with fixed-seed jitter so the chart
    # is reproducible across runs.
    rng = np.random.default_rng(0)
    for pos, kappas in zip(positions, [kappas_with, kappas_without]):
        jitter = rng.normal(0, 0.04, len(kappas))
        ax.scatter([pos + j for j in jitter], kappas, alpha=0.6, s=30, color="black", zorder=3)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "16_with_without_outlier.png", dpi=150)
    plt.close(fig)
    print(" 16_with_without_outlier.png")
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
|
|
# TEXTUAL ANALYSIS OUTPUT
|
|
# ═══════════════════════════════════════════════════════════
|
|
def print_analysis():
    """Print the cross-source agreement summary to stdout.

    Compares human majority labels, the Stage 1 GenAI-panel majority, and
    (when coverage allows) the Opus golden labels, for both category and
    specificity, then lists disagreement patterns and 3-way splits.

    BUG FIX: agreement-rate denominators previously counted paragraphs
    where only ONE source had a majority, while the numerators required
    BOTH — understating the printed rates.  Denominators now require both
    majorities to exist, matching the numerators.
    """
    print("\n" + "=" * 70)
    print("CROSS-SOURCE ANALYSIS")
    print("=" * 70)

    # Human majority vs Stage1 majority vs Opus — category.
    h_eq_s1 = sum(1 for c in consensus.values()
                  if c["human_cat_maj"] and c["s1_cat_maj"] and c["human_cat_maj"] == c["s1_cat_maj"])
    h_eq_op = sum(1 for c in consensus.values()
                  if c["human_cat_maj"] and c["opus_cat"] and c["human_cat_maj"] == c["opus_cat"])
    s1_eq_op = sum(1 for c in consensus.values()
                   if c["s1_cat_maj"] and c["opus_cat"] and c["s1_cat_maj"] == c["opus_cat"])

    # Denominator: paragraphs where both human and Stage1 majorities exist.
    n_hmaj_and_s1 = sum(1 for c in consensus.values()
                        if c["human_cat_maj"] and c["s1_cat_maj"])

    print("\n── Category Agreement Rates ──")
    if n_hmaj_and_s1 > 0:
        print(f" Human maj = Stage1 maj: {h_eq_s1}/{n_hmaj_and_s1} ({h_eq_s1/n_hmaj_and_s1*100:.1f}%)")
    if OPUS_AVAILABLE:
        n_with_opus_and_hmaj = sum(1 for c in consensus.values()
                                   if c["human_cat_maj"] and c["opus_cat"])
        n_with_opus_and_s1 = sum(1 for c in consensus.values()
                                 if c["s1_cat_maj"] and c["opus_cat"])
        if n_with_opus_and_hmaj > 0:
            print(f" Human maj = Opus: {h_eq_op}/{n_with_opus_and_hmaj} ({h_eq_op/n_with_opus_and_hmaj*100:.1f}%)")
        if n_with_opus_and_s1 > 0:
            print(f" Stage1 maj = Opus: {s1_eq_op}/{n_with_opus_and_s1} ({s1_eq_op/n_with_opus_and_s1*100:.1f}%)")
    else:
        print(f" (Opus comparison skipped — only {opus_coverage}/1200 matched)")

    # Specificity — same both-majorities denominator fix.
    h_eq_s1_spec = sum(1 for c in consensus.values()
                       if c["human_spec_maj"] is not None and c["s1_spec_maj"] is not None
                       and c["human_spec_maj"] == c["s1_spec_maj"])
    n_both_spec = sum(1 for c in consensus.values()
                      if c["human_spec_maj"] is not None and c["s1_spec_maj"] is not None)

    print("\n── Specificity Agreement Rates ──")
    if n_both_spec > 0:
        print(f" Human maj = Stage1 maj: {h_eq_s1_spec}/{n_both_spec} ({h_eq_s1_spec/n_both_spec*100:.1f}%)")

    # Disagreement patterns between human and Stage1 on the strongest
    # evidence: paragraphs where BOTH sides are unanimous.
    print("\n── Disagreement Patterns (Human vs Stage1) ──")
    human_unan_s1_agrees = 0
    human_unan_s1_differs = 0
    for c in consensus.values():
        hm = c["human_cat_maj"]
        sm = c["s1_cat_maj"]
        hu = c["human_cat_unanimous"]
        su = c["s1_cat_unanimous"]
        if hm and sm and hu and su:
            if hm == sm:
                human_unan_s1_agrees += 1
            else:
                human_unan_s1_differs += 1

    print(f" Both unanimous, agree: {human_unan_s1_agrees}")
    print(f" Both unanimous, DIFFER: {human_unan_s1_differs}")

    # Where do the majorities differ?  Top confusion axes.
    human_s1_confusion = Counter()
    for c in consensus.values():
        hm = c["human_cat_maj"]
        sm = c["s1_cat_maj"]
        if hm and sm and hm != sm:
            axis = tuple(sorted([hm, sm]))
            human_s1_confusion[axis] += 1

    if human_s1_confusion:
        print("\n Top Human↔Stage1 disagreement axes:")
        for (a, b), count in human_s1_confusion.most_common(8):
            print(f" {CAT_MAP[a]}↔{CAT_MAP[b]}: {count}")

    # Paragraphs with NO majority in a source (3-way splits).
    no_human_maj = sum(1 for c in consensus.values() if c["human_cat_maj"] is None)
    no_s1_maj = sum(1 for c in consensus.values() if c["s1_cat_maj"] is None)
    print("\n── 3-way splits (no majority) ──")
    print(f" Human: {no_human_maj} paragraphs")
    print(f" Stage1: {no_s1_maj} paragraphs")
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
|
|
# Run all
|
|
# ═══════════════════════════════════════════════════════════
|
|
print("\nGenerating charts...")
# Render every chart in report order; each function saves its own PNG
# under CHART_DIR and prints the filename it wrote.
for _chart_fn in (
    plot_kappa_heatmaps,
    plot_annotator_category_dist,
    plot_annotator_spec_dist,
    plot_human_confusion,
    plot_cross_source_confusion,
    plot_cross_source_specificity,
    plot_annotator_vs_references,
    plot_agreement_by_wordcount,
    plot_time_vs_agreement,
    plot_none_other_analysis,
    plot_outlier_annotator,
    plot_human_vs_genai_consensus,
    plot_specificity_bias,
    plot_disagreement_axes,
    plot_quiz_vs_quality,
    plot_with_without_outlier,
):
    _chart_fn()
# Textual cross-source summary goes to stdout after all charts are saved.
print_analysis()
print(f"\nAll charts saved to {CHART_DIR}/")
|