SEC-cyBERT/scripts/analyze-gold.py

"""
Comprehensive analysis of human labeling data cross-referenced with
Stage 1 GenAI panel and Opus golden labels.

Outputs charts to data/gold/charts/ and a summary to stdout.
"""

import json
import os
from collections import Counter, defaultdict
from pathlib import Path

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np

# ── Paths ──
GOLD_DIR = Path("/home/joey/Documents/sec-cyBERT/data/gold")
CHART_DIR = GOLD_DIR / "charts"
STAGE1_PATH = Path("/home/joey/Documents/sec-cyBERT/data/annotations/stage1.patched.jsonl")
OPUS_PATH = Path("/home/joey/Documents/sec-cyBERT/data/annotations/golden/opus.jsonl")
HOLDOUT_PATH = GOLD_DIR / "paragraphs-holdout.jsonl"
LABELS_PATH = GOLD_DIR / "human-labels-raw.jsonl"
METRICS_PATH = GOLD_DIR / "metrics.json"
OPUS_ID_MAP_PATH = GOLD_DIR / "opus-to-db-id-map.json"

CATEGORIES = [
    "Board Governance", "Management Role", "Risk Management Process",
    "Third-Party Risk", "Incident Disclosure", "Strategy Integration", "None/Other",
]
CAT_SHORT = ["BG", "MR", "RMP", "TPR", "ID", "SI", "N/O"]
CAT_MAP = dict(zip(CATEGORIES, CAT_SHORT))
SPEC_LEVELS = [1, 2, 3, 4]

CHART_DIR.mkdir(parents=True, exist_ok=True)


def load_jsonl(path: Path) -> list[dict]:
    records = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records


def majority_vote(items: list[str]) -> str | None:
    """Return majority item if one exists, else None."""
    c = Counter(items)
    top, count = c.most_common(1)[0]
    return top if count > len(items) / 2 else None


def plurality_vote(items: list) -> tuple:
    """Return most common item and its count."""
    c = Counter(items)
    return c.most_common(1)[0]


# ── Load data ──
print("Loading data...")
human_labels = load_jsonl(LABELS_PATH)
paragraphs_all = load_jsonl(HOLDOUT_PATH)
opus_labels = load_jsonl(OPUS_PATH)
metrics = json.loads(METRICS_PATH.read_text())

# Build paragraph metadata lookup (only holdout ones)
holdout_ids = {l["paragraphId"] for l in human_labels}
para_meta = {}
for p in paragraphs_all:
    if p["id"] in holdout_ids:
        para_meta[p["id"]] = p

# Load Stage 1 annotations for holdout
stage1_annots = []
with open(STAGE1_PATH) as f:
    for line in f:
        d = json.loads(line)
        if d["paragraphId"] in holdout_ids:
            stage1_annots.append(d)

# Build lookups
# Opus labels: only use if we have sufficient coverage (>50% of holdout)
# The Opus golden run may have been done on a different sample than what's in the DB.
opus_by_pid: dict[str, dict] = {}
for r in opus_labels:
    if r["paragraphId"] in holdout_ids:
        opus_by_pid[r["paragraphId"]] = r
# Also try ID remapping if direct match is low
if len(opus_by_pid) < 600 and OPUS_ID_MAP_PATH.exists():
    opus_id_map = json.loads(OPUS_ID_MAP_PATH.read_text())
    for r in opus_labels:
        db_pid = opus_id_map.get(r["paragraphId"])
        if db_pid and db_pid in holdout_ids and db_pid not in opus_by_pid:
            opus_by_pid[db_pid] = r

OPUS_AVAILABLE = len(opus_by_pid) >= 600  # gate all Opus analysis on sufficient coverage
opus_coverage = len(opus_by_pid)
print(f"  Opus labels matched to holdout: {opus_coverage}/1200"
      f" {'— SKIPPING Opus analysis (insufficient coverage)' if not OPUS_AVAILABLE else ''}")

# Stage 1: 3 annotations per paragraph
stage1_by_pid: dict[str, list[dict]] = defaultdict(list)
for a in stage1_annots:
    stage1_by_pid[a["paragraphId"]].append(a)

# Human labels grouped by paragraph
human_by_pid: dict[str, list[dict]] = defaultdict(list)
for l in human_labels:
    human_by_pid[l["paragraphId"]].append(l)

# Annotator names
annotator_names = sorted({l["annotatorName"] for l in human_labels})
annotator_ids = sorted({l["annotatorId"] for l in human_labels})
name_to_id = {}
for l in human_labels:
    name_to_id[l["annotatorName"]] = l["annotatorId"]

print(f"  {len(human_labels)} human labels across {len(holdout_ids)} paragraphs")
print(f"  {len(stage1_annots)} Stage 1 annotations")
print(f"  {len(opus_labels)} Opus labels")
print(f"  Annotators: {', '.join(annotator_names)}")

# ── Derive per-paragraph consensus labels ──
consensus = {}  # pid -> {human_cat, human_spec, human_cat_method, ...}
for pid, lbls in human_by_pid.items():
    cats = [l["contentCategory"] for l in lbls]
    specs = [l["specificityLevel"] for l in lbls]

    cat_maj = majority_vote(cats)
    spec_maj = majority_vote([str(s) for s in specs])

    # Stage 1
    s1 = stage1_by_pid.get(pid, [])
    s1_cats = [a["label"]["content_category"] for a in s1]
    s1_specs = [a["label"]["specificity_level"] for a in s1]
    s1_cat_maj = majority_vote(s1_cats) if s1_cats else None
    s1_spec_maj = majority_vote([str(s) for s in s1_specs]) if s1_specs else None

    # Opus
    op = opus_by_pid.get(pid)
    op_cat = op["label"]["content_category"] if op else None
    op_spec = op["label"]["specificity_level"] if op else None

    consensus[pid] = {
        "human_cats": cats,
        "human_specs": specs,
        "human_cat_maj": cat_maj,
        "human_spec_maj": int(spec_maj) if spec_maj else None,
        "human_cat_unanimous": len(set(cats)) == 1,
        "human_spec_unanimous": len(set(specs)) == 1,
        "s1_cats": s1_cats,
        "s1_specs": s1_specs,
        "s1_cat_maj": s1_cat_maj,
        "s1_spec_maj": int(s1_spec_maj) if s1_spec_maj else None,
        "s1_cat_unanimous": len(set(s1_cats)) == 1 if s1_cats else False,
        "opus_cat": op_cat,
        "opus_spec": op_spec,
        "word_count": para_meta.get(pid, {}).get("wordCount", 0),
    }


# ═══════════════════════════════════════════════════════════
# CHART 1: Pairwise Kappa Heatmaps (category + specificity)
# ═══════════════════════════════════════════════════════════
def plot_kappa_heatmaps():
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5.5))

    for ax, dim_key, title in [
        (ax1, "category", "Category"),
        (ax2, "specificity", "Specificity"),
    ]:
        data = metrics["pairwiseKappa"][dim_key]
        names = data["annotators"]
        matrix = np.array(data["matrix"])

        # Mask diagonal
        mask = np.eye(len(names), dtype=bool)
        display = np.where(mask, np.nan, matrix)

        im = ax.imshow(display, cmap="RdYlGn", vmin=0, vmax=1, aspect="equal")
        ax.set_xticks(range(len(names)))
        ax.set_xticklabels(names, rotation=45, ha="right", fontsize=9)
        ax.set_yticks(range(len(names)))
        ax.set_yticklabels(names, fontsize=9)
        ax.set_title(f"Pairwise Cohen's κ — {title}", fontsize=12, fontweight="bold")

        for i in range(len(names)):
            for j in range(len(names)):
                if i != j:
                    color = "white" if matrix[i][j] < 0.4 else "black"
                    ax.text(j, i, f"{matrix[i][j]:.2f}", ha="center", va="center",
                            fontsize=8, color=color)

    fig.colorbar(im, ax=[ax1, ax2], shrink=0.8, label="Cohen's κ")
    fig.tight_layout()
    fig.savefig(CHART_DIR / "01_kappa_heatmaps.png", dpi=150)
    plt.close(fig)
    print("  01_kappa_heatmaps.png")


# ═══════════════════════════════════════════════════════════
# CHART 2: Per-annotator category distribution
# ═══════════════════════════════════════════════════════════
def plot_annotator_category_dist():
    fig, ax = plt.subplots(figsize=(12, 6))

    # Also add Stage 1 majority (and Opus if available)
    sources = list(annotator_names) + ["Stage1 Maj"] + (["Opus"] if OPUS_AVAILABLE else [])

    dist = {s: Counter() for s in sources}
    for l in human_labels:
        dist[l["annotatorName"]][l["contentCategory"]] += 1

    for pid, c in consensus.items():
        if c["s1_cat_maj"]:
            dist["Stage1 Maj"][c["s1_cat_maj"]] += 1
        if OPUS_AVAILABLE and c["opus_cat"]:
            dist["Opus"][c["opus_cat"]] += 1

    x = np.arange(len(sources))
    width = 0.11
    offsets = np.arange(len(CATEGORIES)) - len(CATEGORIES) / 2 + 0.5

    colors = plt.cm.Set2(np.linspace(0, 1, len(CATEGORIES)))

    for i, (cat, color) in enumerate(zip(CATEGORIES, colors)):
        counts = [dist[s].get(cat, 0) for s in sources]
        totals = [sum(dist[s].values()) for s in sources]
        pcts = [c / t * 100 if t > 0 else 0 for c, t in zip(counts, totals)]
        ax.bar(x + offsets[i] * width, pcts, width, label=CAT_MAP[cat], color=color)

    ax.set_xticks(x)
    ax.set_xticklabels(sources, rotation=45, ha="right")
    ax.set_ylabel("% of labels")
    ax.set_title("Category Distribution by Annotator (incl. Stage1 & Opus)", fontweight="bold")
    ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left", fontsize=8)
    ax.yaxis.set_major_formatter(mticker.PercentFormatter())
    fig.tight_layout()
    fig.savefig(CHART_DIR / "02_category_distribution.png", dpi=150)
    plt.close(fig)
    print("  02_category_distribution.png")


# ═══════════════════════════════════════════════════════════
# CHART 3: Per-annotator specificity distribution
# ═══════════════════════════════════════════════════════════
def plot_annotator_spec_dist():
    fig, ax = plt.subplots(figsize=(12, 5))

    sources = list(annotator_names) + ["Stage1 Maj"] + (["Opus"] if OPUS_AVAILABLE else [])

    dist = {s: Counter() for s in sources}
    for l in human_labels:
        dist[l["annotatorName"]][l["specificityLevel"]] += 1

    for pid, c in consensus.items():
        if c["s1_spec_maj"]:
            dist["Stage1 Maj"][c["s1_spec_maj"]] += 1
        if OPUS_AVAILABLE and c["opus_spec"]:
            dist["Opus"][c["opus_spec"]] += 1

    x = np.arange(len(sources))
    width = 0.18
    colors = ["#e74c3c", "#f39c12", "#2ecc71", "#3498db"]
    spec_labels = ["1 Generic", "2 Sector", "3 Firm-Specific", "4 Quantified"]

    for i, (level, color, label) in enumerate(zip(SPEC_LEVELS, colors, spec_labels)):
        counts = [dist[s].get(level, 0) for s in sources]
        totals = [sum(dist[s].values()) for s in sources]
        pcts = [c / t * 100 if t > 0 else 0 for c, t in zip(counts, totals)]
        ax.bar(x + (i - 1.5) * width, pcts, width, label=label, color=color)

    ax.set_xticks(x)
    ax.set_xticklabels(sources, rotation=45, ha="right")
    ax.set_ylabel("% of labels")
    ax.set_title("Specificity Distribution by Annotator (incl. Stage1 & Opus)", fontweight="bold")
    ax.legend()
    ax.yaxis.set_major_formatter(mticker.PercentFormatter())
    fig.tight_layout()
    fig.savefig(CHART_DIR / "03_specificity_distribution.png", dpi=150)
    plt.close(fig)
    print("  03_specificity_distribution.png")


# ═══════════════════════════════════════════════════════════
# CHART 4: Human confusion matrix (aggregated pairwise)
# ═══════════════════════════════════════════════════════════
def plot_human_confusion():
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # Category confusion
    cat_conf = np.zeros((len(CATEGORIES), len(CATEGORIES)))
    cat_idx = {c: i for i, c in enumerate(CATEGORIES)}

    for pid, lbls in human_by_pid.items():
        cats = [l["contentCategory"] for l in lbls]
        for i in range(len(cats)):
            for j in range(i + 1, len(cats)):
                a, b = cat_idx[cats[i]], cat_idx[cats[j]]
                cat_conf[a][b] += 1
                cat_conf[b][a] += 1

    # Normalize rows
    row_sums = cat_conf.sum(axis=1, keepdims=True)
    cat_conf_norm = np.where(row_sums > 0, cat_conf / row_sums * 100, 0)

    im1 = ax1.imshow(cat_conf_norm, cmap="YlOrRd", aspect="equal")
    ax1.set_xticks(range(len(CAT_SHORT)))
    ax1.set_xticklabels(CAT_SHORT, fontsize=9)
    ax1.set_yticks(range(len(CAT_SHORT)))
    ax1.set_yticklabels(CAT_SHORT, fontsize=9)
    ax1.set_title("Human Category Confusion (row-normalized %)", fontweight="bold")
    ax1.set_xlabel("Annotator B")
    ax1.set_ylabel("Annotator A")

    for i in range(len(CAT_SHORT)):
        for j in range(len(CAT_SHORT)):
            val = cat_conf_norm[i][j]
            if val > 0.5:
                color = "white" if val > 40 else "black"
                ax1.text(j, i, f"{val:.0f}", ha="center", va="center",
                        fontsize=7, color=color)

    # Specificity confusion
    spec_conf = np.zeros((4, 4))
    for pid, lbls in human_by_pid.items():
        specs = [l["specificityLevel"] for l in lbls]
        for i in range(len(specs)):
            for j in range(i + 1, len(specs)):
                a, b = specs[i] - 1, specs[j] - 1
                spec_conf[a][b] += 1
                spec_conf[b][a] += 1

    row_sums = spec_conf.sum(axis=1, keepdims=True)
    spec_conf_norm = np.where(row_sums > 0, spec_conf / row_sums * 100, 0)

    im2 = ax2.imshow(spec_conf_norm, cmap="YlOrRd", aspect="equal")
    ax2.set_xticks(range(4))
    ax2.set_xticklabels(["Spec 1", "Spec 2", "Spec 3", "Spec 4"], fontsize=9)
    ax2.set_yticks(range(4))
    ax2.set_yticklabels(["Spec 1", "Spec 2", "Spec 3", "Spec 4"], fontsize=9)
    ax2.set_title("Human Specificity Confusion (row-normalized %)", fontweight="bold")

    for i in range(4):
        for j in range(4):
            val = spec_conf_norm[i][j]
            if val > 0.5:
                color = "white" if val > 40 else "black"
                ax2.text(j, i, f"{val:.0f}", ha="center", va="center",
                        fontsize=9, color=color)

    fig.colorbar(im1, ax=ax1, shrink=0.8)
    fig.colorbar(im2, ax=ax2, shrink=0.8)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "04_human_confusion.png", dpi=150)
    plt.close(fig)
    print("  04_human_confusion.png")


# ═══════════════════════════════════════════════════════════
# CHART 5: Human majority vs Stage 1 majority vs Opus
# ═══════════════════════════════════════════════════════════
def plot_cross_source_confusion():
    comparisons = [
        ("Human Maj", "Stage1 Maj", "human_cat_maj", "s1_cat_maj"),
    ]
    if OPUS_AVAILABLE:
        comparisons += [
            ("Human Maj", "Opus", "human_cat_maj", "opus_cat"),
            ("Stage1 Maj", "Opus", "s1_cat_maj", "opus_cat"),
        ]
    ncols = len(comparisons)
    fig, axes = plt.subplots(1, ncols, figsize=(7 * ncols, 5.5))
    if ncols == 1:
        axes = [axes]

    for ax, (name_a, name_b, key_a, key_b) in zip(axes, comparisons):
        conf = np.zeros((len(CATEGORIES), len(CATEGORIES)))
        cat_idx = {c: i for i, c in enumerate(CATEGORIES)}
        total = 0
        agree = 0

        for pid, c in consensus.items():
            a_val = c[key_a]
            b_val = c[key_b]
            if a_val and b_val:
                conf[cat_idx[a_val]][cat_idx[b_val]] += 1
                total += 1
                if a_val == b_val:
                    agree += 1

        # Normalize rows
        row_sums = conf.sum(axis=1, keepdims=True)
        conf_norm = np.where(row_sums > 0, conf / row_sums * 100, 0)

        im = ax.imshow(conf_norm, cmap="YlGnBu", aspect="equal")
        ax.set_xticks(range(len(CAT_SHORT)))
        ax.set_xticklabels(CAT_SHORT, fontsize=8)
        ax.set_yticks(range(len(CAT_SHORT)))
        ax.set_yticklabels(CAT_SHORT, fontsize=8)
        pct = agree / total * 100 if total > 0 else 0
        ax.set_title(f"{name_a} vs {name_b}\n({pct:.1f}% agree, n={total})",
                     fontweight="bold", fontsize=10)
        ax.set_ylabel(name_a)
        ax.set_xlabel(name_b)

        for i in range(len(CAT_SHORT)):
            for j in range(len(CAT_SHORT)):
                val = conf_norm[i][j]
                if val > 0.5:
                    color = "white" if val > 50 else "black"
                    ax.text(j, i, f"{val:.0f}", ha="center", va="center",
                            fontsize=7, color=color)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "05_cross_source_category.png", dpi=150)
    plt.close(fig)
    print("  05_cross_source_category.png")


# ═══════════════════════════════════════════════════════════
# CHART 6: Cross-source specificity confusion
# ═══════════════════════════════════════════════════════════
def plot_cross_source_specificity():
    comparisons = [
        ("Human Maj", "Stage1 Maj", "human_spec_maj", "s1_spec_maj"),
    ]
    if OPUS_AVAILABLE:
        comparisons += [
            ("Human Maj", "Opus", "human_spec_maj", "opus_spec"),
            ("Stage1 Maj", "Opus", "s1_spec_maj", "opus_spec"),
        ]
    ncols = len(comparisons)
    fig, axes = plt.subplots(1, ncols, figsize=(5.5 * ncols, 4.5))
    if ncols == 1:
        axes = [axes]

    for ax, (name_a, name_b, key_a, key_b) in zip(axes, comparisons):
        conf = np.zeros((4, 4))
        total = 0
        agree = 0

        for pid, c in consensus.items():
            a_val = c[key_a]
            b_val = c[key_b]
            if a_val is not None and b_val is not None:
                conf[a_val - 1][b_val - 1] += 1
                total += 1
                if a_val == b_val:
                    agree += 1

        row_sums = conf.sum(axis=1, keepdims=True)
        conf_norm = np.where(row_sums > 0, conf / row_sums * 100, 0)

        im = ax.imshow(conf_norm, cmap="YlGnBu", aspect="equal")
        ax.set_xticks(range(4))
        ax.set_xticklabels(["S1", "S2", "S3", "S4"], fontsize=9)
        ax.set_yticks(range(4))
        ax.set_yticklabels(["S1", "S2", "S3", "S4"], fontsize=9)
        pct = agree / total * 100 if total > 0 else 0
        ax.set_title(f"{name_a} vs {name_b}\n({pct:.1f}% agree, n={total})",
                     fontweight="bold", fontsize=10)
        ax.set_ylabel(name_a)
        ax.set_xlabel(name_b)

        for i in range(4):
            for j in range(4):
                val = conf_norm[i][j]
                if val > 0.5:
                    color = "white" if val > 50 else "black"
                    ax.text(j, i, f"{val:.0f}", ha="center", va="center",
                            fontsize=9, color=color)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "06_cross_source_specificity.png", dpi=150)
    plt.close(fig)
    print("  06_cross_source_specificity.png")


# ═══════════════════════════════════════════════════════════
# CHART 7: Per-annotator agreement with Stage1 and Opus
# ═══════════════════════════════════════════════════════════
def plot_annotator_vs_references():
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Build per-annotator label lookup
    ann_labels: dict[str, dict[str, dict]] = defaultdict(dict)
    for l in human_labels:
        ann_labels[l["annotatorName"]][l["paragraphId"]] = l

    for ax, dim, title in [(ax1, "cat", "Category"), (ax2, "spec", "Specificity")]:
        ref_sources = [
            ("Stage1 Maj", "s1_cat_maj", "s1_spec_maj"),
            ("Human Maj", "human_cat_maj", "human_spec_maj"),
        ]
        if OPUS_AVAILABLE:
            ref_sources.insert(1, ("Opus", "opus_cat", "opus_spec"))

        x = np.arange(len(annotator_names))
        width = 0.25 if len(ref_sources) == 3 else 0.3

        for ri, (ref_name, ref_key_cat, ref_key_spec) in enumerate(ref_sources):
            rates = []
            for ann_name in annotator_names:
                agree = 0
                total = 0
                for pid, lbl in ann_labels[ann_name].items():
                    c = consensus.get(pid)
                    if not c:
                        continue
                    if dim == "cat":
                        ref_val = c[ref_key_cat]
                        ann_val = lbl["contentCategory"]
                    else:
                        ref_val = c[ref_key_spec]
                        ann_val = lbl["specificityLevel"]
                    if ref_val is not None:
                        total += 1
                        if str(ann_val) == str(ref_val):
                            agree += 1
                rates.append(agree / total * 100 if total > 0 else 0)

            ax.bar(x + (ri - 1) * width, rates, width, label=ref_name)

        ax.set_xticks(x)
        ax.set_xticklabels(annotator_names, rotation=45, ha="right")
        ax.set_ylabel("Agreement %")
        ax.set_title(f"Per-Annotator {title} Agreement with References", fontweight="bold")
        ax.legend()
        ax.set_ylim(0, 100)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "07_annotator_vs_references.png", dpi=150)
    plt.close(fig)
    print("  07_annotator_vs_references.png")


# ═══════════════════════════════════════════════════════════
# CHART 8: Agreement rate by word count (binned)
# ═══════════════════════════════════════════════════════════
def plot_agreement_by_wordcount():
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Bin paragraphs by word count
    wc_bins = [(0, 50), (51, 80), (81, 120), (121, 180), (181, 500)]
    bin_labels = ["≤50", "51-80", "81-120", "121-180", "180+"]

    for ax, dim, title in [(ax1, "cat", "Category"), (ax2, "both", "Both")]:
        rates = []
        ns = []
        for lo, hi in wc_bins:
            agree = 0
            total = 0
            for pid, c in consensus.items():
                wc = c["word_count"]
                if lo <= wc <= hi:
                    total += 1
                    if dim == "cat":
                        if c["human_cat_unanimous"]:
                            agree += 1
                    else:
                        if c["human_cat_unanimous"] and c["human_spec_unanimous"]:
                            agree += 1
            rates.append(agree / total * 100 if total > 0 else 0)
            ns.append(total)

        bars = ax.bar(range(len(bin_labels)), rates, color="#3498db")
        ax.set_xticks(range(len(bin_labels)))
        ax.set_xticklabels(bin_labels)
        ax.set_xlabel("Word Count")
        ax.set_ylabel("Unanimous Agreement %")
        ax.set_title(f"{title} Consensus by Paragraph Length", fontweight="bold")
        ax.set_ylim(0, 80)

        for bar, n in zip(bars, ns):
            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1,
                    f"n={n}", ha="center", va="bottom", fontsize=8)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "08_agreement_by_wordcount.png", dpi=150)
    plt.close(fig)
    print("  08_agreement_by_wordcount.png")


# ═══════════════════════════════════════════════════════════
# CHART 9: Active time vs agreement
# ═══════════════════════════════════════════════════════════
def plot_time_vs_agreement():
    fig, ax = plt.subplots(figsize=(10, 5))

    # For each paragraph, compute median active time and whether humans agreed
    agreed_times = []
    disagreed_times = []

    for pid, lbls in human_by_pid.items():
        times = [l["activeMs"] for l in lbls if l["activeMs"] is not None]
        if not times:
            continue
        med_time = sorted(times)[len(times) // 2] / 1000  # seconds

        cats = [l["contentCategory"] for l in lbls]
        if len(set(cats)) == 1:
            agreed_times.append(med_time)
        else:
            disagreed_times.append(med_time)

    bins = np.linspace(0, 120, 30)
    ax.hist(agreed_times, bins=bins, alpha=0.6, label=f"Category agreed (n={len(agreed_times)})",
            color="#2ecc71", density=True)
    ax.hist(disagreed_times, bins=bins, alpha=0.6, label=f"Category disagreed (n={len(disagreed_times)})",
            color="#e74c3c", density=True)
    ax.set_xlabel("Median Active Time per Paragraph (seconds)")
    ax.set_ylabel("Density")
    ax.set_title("Labeling Time: Agreed vs Disagreed Paragraphs", fontweight="bold")
    ax.legend()
    ax.set_xlim(0, 120)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "09_time_vs_agreement.png", dpi=150)
    plt.close(fig)
    print("  09_time_vs_agreement.png")


# ═══════════════════════════════════════════════════════════
# CHART 10: None/Other deep dive — what do people label instead?
# ═══════════════════════════════════════════════════════════
def plot_none_other_analysis():
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # For paragraphs where at least one annotator said None/Other
    # What did the others say?
    noneother_vs = Counter()
    noneother_pids = set()
    for pid, lbls in human_by_pid.items():
        cats = [l["contentCategory"] for l in lbls]
        if "None/Other" in cats and len(set(cats)) > 1:
            noneother_pids.add(pid)
            for c in cats:
                if c != "None/Other":
                    noneother_vs[c] += 1

    # Also: paragraphs where NO human said None/Other but Stage1 or Opus did
    s1_noneother_human_not = Counter()
    for pid, c in consensus.items():
        human_cats = set(c["human_cats"])
        if "None/Other" not in human_cats:
            if c["s1_cat_maj"] == "None/Other":
                for hc in c["human_cats"]:
                    s1_noneother_human_not[hc] += 1

    cats_sorted = sorted(noneother_vs.keys(), key=lambda c: -noneother_vs[c])
    ax1.barh(range(len(cats_sorted)), [noneother_vs[c] for c in cats_sorted], color="#e74c3c")
    ax1.set_yticks(range(len(cats_sorted)))
    ax1.set_yticklabels([CAT_MAP.get(c, c) for c in cats_sorted])
    ax1.set_xlabel("Count")
    ax1.set_title(f"When someone says N/O but others disagree\n({len(noneother_pids)} paragraphs)",
                  fontweight="bold")
    ax1.invert_yaxis()

    # What does Stage1 say when humans disagree on category?
    s1_for_disagreed = Counter()
    for pid, c in consensus.items():
        if not c["human_cat_unanimous"] and c["s1_cat_maj"]:
            s1_for_disagreed[c["s1_cat_maj"]] += 1

    cats_sorted2 = sorted(s1_for_disagreed.keys(), key=lambda c: -s1_for_disagreed[c])
    ax2.barh(range(len(cats_sorted2)), [s1_for_disagreed[c] for c in cats_sorted2], color="#3498db")
    ax2.set_yticks(range(len(cats_sorted2)))
    ax2.set_yticklabels([CAT_MAP.get(c, c) for c in cats_sorted2])
    ax2.set_xlabel("Count")
    ax2.set_title(f"Stage1 majority for human-disagreed paragraphs\n(n={sum(s1_for_disagreed.values())})",
                  fontweight="bold")
    ax2.invert_yaxis()

    fig.tight_layout()
    fig.savefig(CHART_DIR / "10_none_other_analysis.png", dpi=150)
    plt.close(fig)
    print("  10_none_other_analysis.png")


# ═══════════════════════════════════════════════════════════
# CHART 11: Aaryan vs everyone else — where does he diverge?
# ═══════════════════════════════════════════════════════════
def plot_outlier_annotator():
    # Find the annotator with lowest avg kappa
    cat_kappas = metrics["pairwiseKappa"]["category"]["pairs"]
    ann_kappa_sum = defaultdict(lambda: {"sum": 0, "n": 0})
    for pair in cat_kappas:
        ann_kappa_sum[pair["a1"]]["sum"] += pair["kappa"]
        ann_kappa_sum[pair["a1"]]["n"] += 1
        ann_kappa_sum[pair["a2"]]["sum"] += pair["kappa"]
        ann_kappa_sum[pair["a2"]]["n"] += 1

    outlier = min(ann_kappa_sum, key=lambda a: ann_kappa_sum[a]["sum"] / ann_kappa_sum[a]["n"])

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # What does the outlier label differently?
    # Compare outlier's category choices vs the majority of the other 2 annotators
    outlier_id = name_to_id.get(outlier, outlier)
    outlier_diverge_from = Counter()  # (outlier_cat, others_cat) pairs
    outlier_diverge_to = Counter()

    for pid, lbls in human_by_pid.items():
        outlier_lbl = None
        others = []
        for l in lbls:
            if l["annotatorName"] == outlier:
                outlier_lbl = l
            else:
                others.append(l)

        if outlier_lbl and len(others) >= 2:
            other_cats = [o["contentCategory"] for o in others]
            if other_cats[0] == other_cats[1] and other_cats[0] != outlier_lbl["contentCategory"]:
                outlier_diverge_from[other_cats[0]] += 1
                outlier_diverge_to[outlier_lbl["contentCategory"]] += 1

    # Diverge FROM (what category the others agreed on)
    cats1 = sorted(outlier_diverge_from.keys(), key=lambda c: -outlier_diverge_from[c])
    ax1.barh(range(len(cats1)), [outlier_diverge_from[c] for c in cats1], color="#e74c3c")
    ax1.set_yticks(range(len(cats1)))
    ax1.set_yticklabels([CAT_MAP.get(c, c) for c in cats1])
    ax1.set_xlabel("Count")
    ax1.set_title(f"{outlier} disagrees: what others chose\n(others agreed, {outlier} didn't)",
                  fontweight="bold")
    ax1.invert_yaxis()

    # Diverge TO (what did the outlier pick instead)
    cats2 = sorted(outlier_diverge_to.keys(), key=lambda c: -outlier_diverge_to[c])
    ax2.barh(range(len(cats2)), [outlier_diverge_to[c] for c in cats2], color="#f39c12")
    ax2.set_yticks(range(len(cats2)))
    ax2.set_yticklabels([CAT_MAP.get(c, c) for c in cats2])
    ax2.set_xlabel("Count")
    ax2.set_title(f"What {outlier} chose instead", fontweight="bold")
    ax2.invert_yaxis()

    fig.suptitle(f"Outlier Analysis: {outlier} (lowest avg κ = "
                 f"{ann_kappa_sum[outlier]['sum']/ann_kappa_sum[outlier]['n']:.3f})",
                 fontweight="bold", fontsize=12)
    fig.tight_layout()
    fig.savefig(CHART_DIR / "11_outlier_annotator.png", dpi=150)
    plt.close(fig)
    print("  11_outlier_annotator.png")


# ═══════════════════════════════════════════════════════════
# CHART 12: Human vs GenAI consensus comparison
# ═══════════════════════════════════════════════════════════
def plot_human_vs_genai_consensus():
    fig, axes = plt.subplots(1, 3, figsize=(16, 5))

    # For each paragraph: human unanimity vs stage1 unanimity
    # Quadrants: both agree, human only, stage1 only, neither
    human_unan_cat = sum(1 for c in consensus.values() if c["human_cat_unanimous"])
    s1_unan_cat = sum(1 for c in consensus.values() if c["s1_cat_unanimous"])
    both_unan_cat = sum(1 for c in consensus.values()
                        if c["human_cat_unanimous"] and c["s1_cat_unanimous"])

    human_unan_spec = sum(1 for c in consensus.values() if c["human_spec_unanimous"])
    s1_unan_spec = sum(1 for c in consensus.values()
                       if len(set(c["s1_specs"])) == 1 if c["s1_specs"])

    # Chart 1: Category agreement Venn-style comparison
    ax = axes[0]
    labels_data = ["Human\nunanimous", "Stage1\nunanimous", "Both\nunanimous"]
    vals = [human_unan_cat, s1_unan_cat, both_unan_cat]
    pcts = [v / 1200 * 100 for v in vals]
    bars = ax.bar(range(3), pcts, color=["#3498db", "#e74c3c", "#2ecc71"])
    ax.set_xticks(range(3))
    ax.set_xticklabels(labels_data)
    ax.set_ylabel("%")
    ax.set_title("Category Unanimity Rates", fontweight="bold")
    for bar, v, p in zip(bars, vals, pcts):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1,
                f"{p:.1f}%\n({v})", ha="center", fontsize=9)

    # Chart 2: Human vs Stage1 category agreement breakdown
    ax = axes[1]
    both_agree = 0  # human unanimous AND matches s1
    human_unan_s1_diff = 0  # human unanimous but s1 differs
    s1_unan_human_diff = 0  # s1 unanimous but human majority differs
    both_majority_agree = 0  # neither unanimous but majorities match
    majorities_differ = 0

    for pid, c in consensus.items():
        hm = c["human_cat_maj"]
        sm = c["s1_cat_maj"]
        hu = c["human_cat_unanimous"]
        su = c["s1_cat_unanimous"]
        if not hm or not sm:
            continue
        if hm == sm:
            both_majority_agree += 1
        else:
            majorities_differ += 1

    total = both_majority_agree + majorities_differ
    vals = [both_majority_agree, majorities_differ]
    pcts = [v / total * 100 for v in vals]
    labels_d = ["Majorities\nagree", "Majorities\ndiffer"]
    colors_d = ["#2ecc71", "#e74c3c"]
    bars = ax.bar(range(2), pcts, color=colors_d)
    ax.set_xticks(range(2))
    ax.set_xticklabels(labels_d)
    ax.set_ylabel("%")
    ax.set_title(f"Human vs Stage1 Category Agreement\n(n={total})", fontweight="bold")
    for bar, v, p in zip(bars, vals, pcts):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5,
                f"{v}\n({p:.1f}%)", ha="center", fontsize=9)

    # Chart 3: Same for specificity
    ax = axes[2]
    spec_agree = 0
    spec_differ = 0
    for pid, c in consensus.items():
        hm = c["human_spec_maj"]
        sm = c["s1_spec_maj"]
        if hm is None or sm is None:
            continue
        if hm == sm:
            spec_agree += 1
        else:
            spec_differ += 1

    total = spec_agree + spec_differ
    vals = [spec_agree, spec_differ]
    pcts = [v / total * 100 for v in vals]
    bars = ax.bar(range(2), pcts, color=colors_d)
    ax.set_xticks(range(2))
    ax.set_xticklabels(labels_d)
    ax.set_ylabel("%")
    ax.set_title(f"Human vs Stage1 Specificity Agreement\n(n={total})", fontweight="bold")
    for bar, v, p in zip(bars, vals, pcts):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5,
                f"{v}\n({p:.1f}%)", ha="center", fontsize=9)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "12_human_vs_genai_consensus.png", dpi=150)
    plt.close(fig)
    print("  12_human_vs_genai_consensus.png")


# ═══════════════════════════════════════════════════════════
# CHART 13: Per-annotator specificity bias
# ═══════════════════════════════════════════════════════════
def plot_specificity_bias():
    fig, ax = plt.subplots(figsize=(10, 5))

    # For each annotator, compare their spec vs Opus spec
    ann_labels_by_name: dict[str, dict[str, dict]] = defaultdict(dict)
    for l in human_labels:
        ann_labels_by_name[l["annotatorName"]][l["paragraphId"]] = l

    names = annotator_names
    biases = []  # mean(annotator_spec - stage1_majority_spec)
    for name in names:
        diffs = []
        for pid, lbl in ann_labels_by_name[name].items():
            c = consensus.get(pid)
            if c and c["s1_spec_maj"] is not None:
                diffs.append(lbl["specificityLevel"] - c["s1_spec_maj"])
        biases.append(np.mean(diffs) if diffs else 0)

    colors = ["#e74c3c" if b < -0.1 else "#2ecc71" if b > 0.1 else "#95a5a6" for b in biases]
    bars = ax.bar(range(len(names)), biases, color=colors)
    ax.set_xticks(range(len(names)))
    ax.set_xticklabels(names, rotation=45, ha="right")
    ax.set_ylabel("Mean (Human - Stage1 Maj) Specificity")
    ax.set_title("Specificity Bias vs Stage1 (negative = under-rates, positive = over-rates)",
                 fontweight="bold")
    ax.axhline(0, color="black", linewidth=0.5)

    for bar, b in zip(bars, biases):
        ax.text(bar.get_x() + bar.get_width() / 2,
                bar.get_height() + (0.02 if b >= 0 else -0.05),
                f"{b:+.2f}", ha="center", fontsize=9)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "13_specificity_bias.png", dpi=150)
    plt.close(fig)
    print("  13_specificity_bias.png")


# ═══════════════════════════════════════════════════════════
# CHART 14: Disagreement axes — human vs GenAI top confusions
# ═══════════════════════════════════════════════════════════
def plot_disagreement_axes():
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Human disagreement axes (where 2 annotators agree, 1 disagrees)
    human_axes = Counter()
    for pid, lbls in human_by_pid.items():
        cats = [l["contentCategory"] for l in lbls]
        if len(set(cats)) == 2:
            c = Counter(cats)
            items = c.most_common()
            axis = tuple(sorted([items[0][0], items[1][0]]))
            human_axes[axis] += 1
        elif len(set(cats)) == 3:
            for i, c1 in enumerate(cats):
                for c2 in cats[i+1:]:
                    if c1 != c2:
                        axis = tuple(sorted([c1, c2]))
                        human_axes[axis] += 1

    top_human = human_axes.most_common(10)
    labels_h = [f"{CAT_MAP[a]}↔{CAT_MAP[b]}" for (a, b), _ in top_human]
    counts_h = [c for _, c in top_human]

    ax1.barh(range(len(labels_h)), counts_h, color="#e74c3c")
    ax1.set_yticks(range(len(labels_h)))
    ax1.set_yticklabels(labels_h, fontsize=9)
    ax1.set_xlabel("Disagreement count")
    ax1.set_title("Human Top Disagreement Axes", fontweight="bold")
    ax1.invert_yaxis()

    # Stage 1 disagreement axes on same paragraphs
    s1_axes = Counter()
    for pid, c in consensus.items():
        s1_cats = c["s1_cats"]
        if len(set(s1_cats)) == 2:
            cnt = Counter(s1_cats)
            items = cnt.most_common()
            axis = tuple(sorted([items[0][0], items[1][0]]))
            s1_axes[axis] += 1
        elif len(set(s1_cats)) == 3:
            for i, c1 in enumerate(s1_cats):
                for c2 in s1_cats[i+1:]:
                    if c1 != c2:
                        axis = tuple(sorted([c1, c2]))
                        s1_axes[axis] += 1

    top_s1 = s1_axes.most_common(10)
    labels_s = [f"{CAT_MAP[a]}↔{CAT_MAP[b]}" for (a, b), _ in top_s1]
    counts_s = [c for _, c in top_s1]

    ax2.barh(range(len(labels_s)), counts_s, color="#3498db")
    ax2.set_yticks(range(len(labels_s)))
    ax2.set_yticklabels(labels_s, fontsize=9)
    ax2.set_xlabel("Disagreement count")
    ax2.set_title("Stage 1 Top Disagreement Axes (same paragraphs)", fontweight="bold")
    ax2.invert_yaxis()

    fig.tight_layout()
    fig.savefig(CHART_DIR / "14_disagreement_axes.png", dpi=150)
    plt.close(fig)
    print("  14_disagreement_axes.png")


# ═══════════════════════════════════════════════════════════
# CHART 15: Quiz performance vs labeling quality
# ═══════════════════════════════════════════════════════════
def plot_quiz_vs_quality():
    fig, ax = plt.subplots(figsize=(10, 5))

    # Load quiz data
    quiz_sessions = load_jsonl(GOLD_DIR / "quiz-sessions.jsonl")

    # Best quiz score per annotator
    best_quiz: dict[str, int] = {}
    attempts: dict[str, int] = defaultdict(int)
    for q in quiz_sessions:
        name = q["annotatorName"]
        attempts[name] += 1
        if q["passed"]:
            if name not in best_quiz or q["score"] > best_quiz[name]:
                best_quiz[name] = q["score"]

    # Agreement rate with Stage1 majority per annotator
    ann_labels_by_name: dict[str, dict[str, dict]] = defaultdict(dict)
    for l in human_labels:
        ann_labels_by_name[l["annotatorName"]][l["paragraphId"]] = l

    s1_agree = {}
    for name in annotator_names:
        agree = 0
        total = 0
        for pid, lbl in ann_labels_by_name[name].items():
            c = consensus.get(pid)
            if c and c["s1_cat_maj"]:
                total += 1
                if lbl["contentCategory"] == c["s1_cat_maj"]:
                    agree += 1
        s1_agree[name] = agree / total * 100 if total > 0 else 0

    x = np.arange(len(annotator_names))
    width = 0.35
    ax.bar(x - width/2, [attempts.get(n, 0) for n in annotator_names],
           width, label="Quiz attempts", color="#f39c12")
    ax2 = ax.twinx()
    ax2.bar(x + width/2, [s1_agree.get(n, 0) for n in annotator_names],
            width, label="Category agree w/ Stage1 (%)", color="#3498db", alpha=0.7)

    ax.set_xticks(x)
    ax.set_xticklabels(annotator_names, rotation=45, ha="right")
    ax.set_ylabel("Quiz attempts", color="#f39c12")
    ax2.set_ylabel("Opus agreement %", color="#3498db")
    ax.set_title("Quiz Attempts vs Labeling Quality (Stage1 Agreement)", fontweight="bold")

    lines1, labels1 = ax.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax.legend(lines1 + lines2, labels1 + labels2, loc="upper left")

    fig.tight_layout()
    fig.savefig(CHART_DIR / "15_quiz_vs_quality.png", dpi=150)
    plt.close(fig)
    print("  15_quiz_vs_quality.png")


# ═══════════════════════════════════════════════════════════
# CHART 16: Aaryan-excluded metrics comparison
# ═══════════════════════════════════════════════════════════
def plot_with_without_outlier():
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Find outlier (lowest avg kappa)
    cat_kappas = metrics["pairwiseKappa"]["category"]["pairs"]
    ann_kappa_sum = defaultdict(lambda: {"sum": 0, "n": 0})
    for pair in cat_kappas:
        ann_kappa_sum[pair["a1"]]["sum"] += pair["kappa"]
        ann_kappa_sum[pair["a1"]]["n"] += 1
        ann_kappa_sum[pair["a2"]]["sum"] += pair["kappa"]
        ann_kappa_sum[pair["a2"]]["n"] += 1
    outlier = min(ann_kappa_sum, key=lambda a: ann_kappa_sum[a]["sum"] / ann_kappa_sum[a]["n"])

    # Compute consensus with and without outlier
    # For paragraphs where outlier participated
    outlier_participated = 0
    cat_agree_with = 0
    cat_agree_without = 0
    spec_agree_with = 0
    spec_agree_without = 0
    both_agree_with = 0
    both_agree_without = 0

    for pid, lbls in human_by_pid.items():
        if len(lbls) < 3:
            continue
        names = [l["annotatorName"] for l in lbls]
        if outlier not in names:
            continue
        outlier_participated += 1

        cats_all = [l["contentCategory"] for l in lbls]
        specs_all = [l["specificityLevel"] for l in lbls]
        cats_excl = [l["contentCategory"] for l in lbls if l["annotatorName"] != outlier]
        specs_excl = [l["specificityLevel"] for l in lbls if l["annotatorName"] != outlier]

        cat_u_all = len(set(cats_all)) == 1
        cat_u_excl = len(set(cats_excl)) == 1
        spec_u_all = len(set(specs_all)) == 1
        spec_u_excl = len(set(specs_excl)) == 1

        if cat_u_all: cat_agree_with += 1
        if cat_u_excl: cat_agree_without += 1
        if spec_u_all: spec_agree_with += 1
        if spec_u_excl: spec_agree_without += 1
        if cat_u_all and spec_u_all: both_agree_with += 1
        if cat_u_excl and spec_u_excl: both_agree_without += 1

    n = outlier_participated
    metrics_labels = ["Category\nUnanimous", "Specificity\nUnanimous", "Both\nUnanimous"]
    with_vals = [cat_agree_with / n * 100, spec_agree_with / n * 100, both_agree_with / n * 100]
    without_vals = [cat_agree_without / n * 100, spec_agree_without / n * 100, both_agree_without / n * 100]

    ax = axes[0]
    x = np.arange(3)
    width = 0.35
    ax.bar(x - width/2, with_vals, width, label=f"All 3 annotators", color="#e74c3c")
    ax.bar(x + width/2, without_vals, width, label=f"Excluding {outlier}", color="#2ecc71")
    ax.set_xticks(x)
    ax.set_xticklabels(metrics_labels)
    ax.set_ylabel("% of paragraphs")
    ax.set_title(f"Agreement on {outlier}'s paragraphs (n={n})", fontweight="bold")
    ax.legend()

    for i, (w, wo) in enumerate(zip(with_vals, without_vals)):
        delta = wo - w
        ax.text(i, max(w, wo) + 2, f"Δ={delta:+.1f}pp", ha="center", fontsize=9, fontweight="bold")

    # Chart 2: kappa distributions with/without
    ax = axes[1]
    kappas_with = [p["kappa"] for p in cat_kappas]
    kappas_without = [p["kappa"] for p in cat_kappas if outlier not in (p["a1"], p["a2"])]

    positions = [1, 2]
    bp = ax.boxplot([kappas_with, kappas_without], positions=positions, widths=0.5,
                    patch_artist=True)
    bp["boxes"][0].set_facecolor("#e74c3c")
    bp["boxes"][0].set_alpha(0.5)
    bp["boxes"][1].set_facecolor("#2ecc71")
    bp["boxes"][1].set_alpha(0.5)

    ax.set_xticks(positions)
    ax.set_xticklabels(["All pairs", f"Excl. {outlier}"])
    ax.set_ylabel("Cohen's κ (category)")
    ax.set_title("Kappa Distribution", fontweight="bold")

    # Add individual points
    for pos, kappas in zip(positions, [kappas_with, kappas_without]):
        jitter = np.random.normal(0, 0.04, len(kappas))
        ax.scatter([pos + j for j in jitter], kappas, alpha=0.6, s=30, color="black", zorder=3)

    fig.tight_layout()
    fig.savefig(CHART_DIR / "16_with_without_outlier.png", dpi=150)
    plt.close(fig)
    print("  16_with_without_outlier.png")


# ═══════════════════════════════════════════════════════════
# TEXTUAL ANALYSIS OUTPUT
# ═══════════════════════════════════════════════════════════
def print_analysis():
    print("\n" + "=" * 70)
    print("CROSS-SOURCE ANALYSIS")
    print("=" * 70)

    # Human majority vs Stage1 majority vs Opus — category
    h_eq_s1 = sum(1 for c in consensus.values()
                  if c["human_cat_maj"] and c["s1_cat_maj"] and c["human_cat_maj"] == c["s1_cat_maj"])
    h_eq_op = sum(1 for c in consensus.values()
                  if c["human_cat_maj"] and c["opus_cat"] and c["human_cat_maj"] == c["opus_cat"])
    s1_eq_op = sum(1 for c in consensus.values()
                   if c["s1_cat_maj"] and c["opus_cat"] and c["s1_cat_maj"] == c["opus_cat"])

    # Count where all exist
    n_with_all_cat = sum(1 for c in consensus.values()
                         if c["human_cat_maj"] and c["s1_cat_maj"] and c["opus_cat"])
    n_with_hmaj = sum(1 for c in consensus.values() if c["human_cat_maj"])
    n_with_s1maj = sum(1 for c in consensus.values() if c["s1_cat_maj"])

    print(f"\n── Category Agreement Rates ──")
    print(f"  Human maj = Stage1 maj:  {h_eq_s1}/{n_with_hmaj} ({h_eq_s1/n_with_hmaj*100:.1f}%)")
    if OPUS_AVAILABLE:
        n_with_opus_and_hmaj = sum(1 for c in consensus.values()
                                   if c["human_cat_maj"] and c["opus_cat"])
        n_with_opus_and_s1 = sum(1 for c in consensus.values()
                                  if c["s1_cat_maj"] and c["opus_cat"])
        if n_with_opus_and_hmaj > 0:
            print(f"  Human maj = Opus:        {h_eq_op}/{n_with_opus_and_hmaj} ({h_eq_op/n_with_opus_and_hmaj*100:.1f}%)")
        if n_with_opus_and_s1 > 0:
            print(f"  Stage1 maj = Opus:       {s1_eq_op}/{n_with_opus_and_s1} ({s1_eq_op/n_with_opus_and_s1*100:.1f}%)")
    else:
        print(f"  (Opus comparison skipped — only {opus_coverage}/1200 matched)")

    # Specificity
    h_eq_s1_spec = sum(1 for c in consensus.values()
                       if c["human_spec_maj"] is not None and c["s1_spec_maj"] is not None
                       and c["human_spec_maj"] == c["s1_spec_maj"])

    n_h_spec = sum(1 for c in consensus.values() if c["human_spec_maj"] is not None)

    print(f"\n── Specificity Agreement Rates ──")
    print(f"  Human maj = Stage1 maj:  {h_eq_s1_spec}/{n_h_spec} ({h_eq_s1_spec/n_h_spec*100:.1f}%)")

    # Disagreement patterns between human and Stage1
    print(f"\n── Disagreement Patterns (Human vs Stage1) ──")
    human_unan_s1_agrees = 0
    human_unan_s1_differs = 0
    s1_unan_human_agrees = 0
    s1_unan_human_differs = 0
    for c in consensus.values():
        hm = c["human_cat_maj"]
        sm = c["s1_cat_maj"]
        hu = c["human_cat_unanimous"]
        su = c["s1_cat_unanimous"]
        if hm and sm:
            if hu and su:
                if hm == sm:
                    human_unan_s1_agrees += 1
                else:
                    human_unan_s1_differs += 1

    print(f"  Both unanimous, agree:  {human_unan_s1_agrees}")
    print(f"  Both unanimous, DIFFER: {human_unan_s1_differs}")

    # Where do the majorities differ? Top confusion axes
    human_s1_confusion = Counter()
    for c in consensus.values():
        hm = c["human_cat_maj"]
        sm = c["s1_cat_maj"]
        if hm and sm and hm != sm:
            axis = tuple(sorted([hm, sm]))
            human_s1_confusion[axis] += 1

    if human_s1_confusion:
        print(f"\n  Top Human↔Stage1 disagreement axes:")
        for (a, b), count in human_s1_confusion.most_common(8):
            print(f"    {CAT_MAP[a]}↔{CAT_MAP[b]}: {count}")

    # Paragraphs with NO majority on any source
    no_human_maj = sum(1 for c in consensus.values() if c["human_cat_maj"] is None)
    no_s1_maj = sum(1 for c in consensus.values() if c["s1_cat_maj"] is None)
    print(f"\n── 3-way splits (no majority) ──")
    print(f"  Human: {no_human_maj} paragraphs")
    print(f"  Stage1: {no_s1_maj} paragraphs")


# ═══════════════════════════════════════════════════════════
# Run all
# ═══════════════════════════════════════════════════════════
print("\nGenerating charts...")
plot_kappa_heatmaps()
plot_annotator_category_dist()
plot_annotator_spec_dist()
plot_human_confusion()
plot_cross_source_confusion()
plot_cross_source_specificity()
plot_annotator_vs_references()
plot_agreement_by_wordcount()
plot_time_vs_agreement()
plot_none_other_analysis()
plot_outlier_annotator()
plot_human_vs_genai_consensus()
plot_specificity_bias()
plot_disagreement_axes()
plot_quiz_vs_quality()
plot_with_without_outlier()
print_analysis()
print(f"\nAll charts saved to {CHART_DIR}/")