SEC-cyBERT/scripts/plot-stage1-distributions.py

"""
Stage 1 (v2) distribution charts for the writeup.
Generates: category distribution, specificity distribution,
cross-run agreement, consensus method breakdown, specificity
disagreement boundary analysis, category × specificity heatmap,
and v1-vs-v2 category comparison.

Usage: uvx --with matplotlib --with numpy python scripts/plot-stage1-distributions.py
"""

import json
import collections
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np

# Input and output directories both hang off the directory two levels above
# this script file.
_REPO_ROOT = Path(__file__).resolve().parents[1]
DATA = _REPO_ROOT / "data"
FIGS = _REPO_ROOT / "figures"
FIGS.mkdir(exist_ok=True)  # create the output directory if it is missing

# ── Color palette ────────────────────────────────────────────────────────────
# One row per content category: (full name, short axis label, bar color).
# CAT_COLORS and CAT_ABBREV are both derived from this table so the two
# lookups always cover the same categories in the same order.
_CATEGORY_STYLE = (
    ("Risk Management Process", "RMP", "#2196F3"),
    ("Board Governance", "BG", "#4CAF50"),
    ("Management Role", "MR", "#FF9800"),
    ("Strategy Integration", "SI", "#9C27B0"),
    ("None/Other", "N/O", "#607D8B"),
    ("Third-Party Risk", "TP", "#F44336"),
    ("Incident Disclosure", "ID", "#00BCD4"),
)
CAT_COLORS = {name: color for name, _, color in _CATEGORY_STYLE}
CAT_ABBREV = {name: abbrev for name, abbrev, _ in _CATEGORY_STYLE}

# Specificity levels 1-4; list index 0 corresponds to level 1.
SPEC_COLORS = [
    "#BDBDBD",
    "#64B5F6",
    "#FFB74D",
    "#EF5350",
]
SPEC_LABELS = [
    "L1: Generic\nBoilerplate",
    "L2: Domain-\nAdapted",
    "L3: Firm-\nSpecific",
    "L4: Quantified-\nVerifiable",
]

# ── Load data ────────────────────────────────────────────────────────────────
def _load_labels(path):
    """Read a JSONL annotation file into a ``{paragraphId: label}`` dict.

    Every non-blank line must be a JSON object with ``paragraphId`` and
    ``label`` keys. Blank lines (e.g. a trailing newline at end of file) are
    skipped. If a paragraphId appears more than once, the last line wins.

    Raises:
        FileNotFoundError: if *path* does not exist.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``paragraphId`` or ``label``.
    """
    labels = {}
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            record = json.loads(line)
            labels[record["paragraphId"]] = record["label"]
    return labels


# Three independent annotation runs, keyed by run number.
runs = {
    run: _load_labels(DATA / f"annotations/v2-stage1/grok-4.1-fast.run{run}.jsonl")
    for run in [1, 2, 3]
}

# Load judge results (optional: the file may be absent).
judge_path = DATA / "annotations/v2-stage1/judge.jsonl"
judge = _load_labels(judge_path) if judge_path.exists() else {}

# Only paragraphs labelled by all three runs take part in the analysis.
all_ids = sorted(set(runs[1]) & set(runs[2]) & set(runs[3]))
N = len(all_ids)
print(f"Loaded {N} paragraphs across 3 runs, {len(judge)} judge results")
if N == 0:
    # Every percentage below divides by N; stop with a clear message instead
    # of failing later with ZeroDivisionError.
    raise SystemExit("No paragraph is present in all 3 runs; nothing to plot.")

# ── Compute consensus labels ─────────────────────────────────────────────────
# For each paragraph, take the most common label per dimension across the
# three runs. When either dimension lacks at least a 2-of-3 vote, the judge's
# label replaces both dimensions if the judge labelled that paragraph;
# otherwise the paragraph is counted as "Unresolved" and keeps the
# most-common labels.
final_cats = []
final_specs = []
consensus_methods = collections.Counter()

for pid in all_ids:
    votes = [runs[r][pid] for r in (1, 2, 3)]
    cat_votes = collections.Counter([v["content_category"] for v in votes])
    spec_votes = collections.Counter([v["specificity_level"] for v in votes])
    top_cat, cat_support = cat_votes.most_common(1)[0]
    top_spec, spec_support = spec_votes.most_common(1)[0]

    # The weaker of the two dimensions decides how the paragraph is resolved.
    weakest = min(cat_support, spec_support)
    if weakest == 3:
        method = "Unanimous (3/3)"
    elif weakest >= 2:
        method = "Majority (2/3)"
    elif pid in judge:
        # Judge tiebreaker
        method = "Judge tiebreaker"
        top_cat = judge[pid]["content_category"]
        top_spec = judge[pid]["specificity_level"]
    else:
        method = "Unresolved"

    consensus_methods[method] += 1
    final_cats.append(top_cat)
    final_specs.append(top_spec)

# Global matplotlib defaults applied to every figure produced below.
plt.rc("font", family="sans-serif", size=11)
plt.rc("axes", titlesize=13, titleweight="bold")
plt.rc("figure", facecolor="white")


# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 1: Category Distribution (final consensus)
# ══════════════════════════════════════════════════════════════════════════════
# Bar chart of how many paragraphs ended up with each content category as
# their consensus label. cat_counts_final and cat_order are reused by later
# sections of this script.
cat_counts_final = collections.Counter(final_cats)
cat_order = ["Risk Management Process", "Board Governance", "Management Role",
             "Strategy Integration", "None/Other", "Third-Party Risk", "Incident Disclosure"]

fig, ax = plt.subplots(figsize=(10, 5))
x = np.arange(len(cat_order))
counts = [cat_counts_final[c] for c in cat_order]
colors = [CAT_COLORS[c] for c in cat_order]
bars = ax.bar(x, counts, color=colors, edgecolor="white", linewidth=0.5)

for bar, count in zip(bars, counts):
    pct = count / N * 100
    # Offset the label in points, not data units, so the gap above each bar
    # stays the same regardless of how many paragraphs are plotted.
    ax.annotate(
        f"{count:,}\n({pct:.1f}%)",
        xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
        xytext=(0, 3), textcoords="offset points",
        ha="center", va="bottom", fontsize=9,
    )

ax.set_xticks(x)
ax.set_xticklabels([CAT_ABBREV[c] for c in cat_order], fontsize=11)
ax.set_ylabel("Paragraphs")
# Report the actual number of paragraphs loaded rather than a fixed figure.
ax.set_title(f"Content Category Distribution — Stage 1 Consensus ({N:,} paragraphs)")
ax.set_ylim(0, max(counts) * 1.18)  # headroom for the two-line bar labels
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
fig.tight_layout()
fig.savefig(FIGS / "stage1-category-distribution.png", dpi=200)
plt.close(fig)
print("  ✓ stage1-category-distribution.png")


# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 2: Specificity Distribution (final consensus)
# ══════════════════════════════════════════════════════════════════════════════
# Bar chart of consensus specificity levels 1-4. spec_counts_final is reused
# by the summary printed at the end of this script.
spec_counts_final = collections.Counter(final_specs)

fig, ax = plt.subplots(figsize=(8, 5))
x = np.arange(4)
counts = [spec_counts_final.get(level, 0) for level in range(1, 5)]
bars = ax.bar(x, counts, color=SPEC_COLORS, edgecolor="white", linewidth=0.5)

for bar, count in zip(bars, counts):
    pct = count / N * 100
    # Offset the label in points, not data units, so the gap above each bar
    # stays the same regardless of how many paragraphs are plotted.
    ax.annotate(
        f"{count:,}\n({pct:.1f}%)",
        xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
        xytext=(0, 3), textcoords="offset points",
        ha="center", va="bottom", fontsize=9,
    )

ax.set_xticks(x)
ax.set_xticklabels(SPEC_LABELS, fontsize=10)
ax.set_ylabel("Paragraphs")
# Report the actual number of paragraphs loaded rather than a fixed figure.
ax.set_title(f"Specificity Distribution — Stage 1 Consensus ({N:,} paragraphs)")
ax.set_ylim(0, max(counts) * 1.18)  # headroom for the two-line bar labels
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
fig.tight_layout()
fig.savefig(FIGS / "stage1-specificity-distribution.png", dpi=200)
plt.close(fig)
print("  ✓ stage1-specificity-distribution.png")


# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 3: Cross-run agreement (stacked bar showing unanimity rates)
# ══════════════════════════════════════════════════════════════════════════════
# With three runs, the number of distinct labels a paragraph received on one
# dimension (1, 2 or 3) maps directly to unanimous / 2-of-3 majority / total
# disagreement. Each dimension is tallied independently.
bucket_for = {1: "Unanimous", 2: "Majority", 3: "All disagree"}
cat_agreement = dict.fromkeys(bucket_for.values(), 0)
spec_agreement = dict.fromkeys(bucket_for.values(), 0)

for pid in all_ids:
    cats = [runs[r][pid]["content_category"] for r in [1, 2, 3]]
    specs = [runs[r][pid]["specificity_level"] for r in [1, 2, 3]]
    cat_agreement[bucket_for[len(set(cats))]] += 1
    spec_agreement[bucket_for[len(set(specs))]] += 1

fig, ax = plt.subplots(figsize=(8, 5))
dims = ["Category", "Specificity"]
# Percentages per dimension, in the same order as dims.
unanimous = [cat_agreement["Unanimous"] / N * 100, spec_agreement["Unanimous"] / N * 100]
majority = [cat_agreement["Majority"] / N * 100, spec_agreement["Majority"] / N * 100]
alldis = [cat_agreement["All disagree"] / N * 100, spec_agreement["All disagree"] / N * 100]

x = np.arange(len(dims))
w = 0.5
b1 = ax.bar(x, unanimous, w, label="Unanimous (3/3)", color="#4CAF50")
b2 = ax.bar(x, majority, w, bottom=unanimous, label="Majority (2/3)", color="#FFC107")
b3 = ax.bar(x, alldis, w, bottom=[u + m for u, m in zip(unanimous, majority)],
            label="All disagree", color="#F44336")

for i, (u, m, a) in enumerate(zip(unanimous, majority, alldis)):
    ax.text(i, u / 2, f"{u:.1f}%", ha="center", va="center", fontsize=11, fontweight="bold", color="white")
    # Segments too thin to hold text are left unlabelled.
    if m > 2:
        ax.text(i, u + m / 2, f"{m:.1f}%", ha="center", va="center", fontsize=10, color="black")
    if a > 0.5:
        ax.text(i, u + m + a / 2, f"{a:.2f}%", ha="center", va="center", fontsize=8, color="white")

ax.set_xticks(x)
ax.set_xticklabels(dims, fontsize=12)
ax.set_ylabel("Percentage of paragraphs")
# Report the actual number of paragraphs loaded rather than a fixed figure.
ax.set_title(f"Grok ×3 Cross-Run Agreement ({N:,} paragraphs)")
ax.set_ylim(0, 105)
ax.legend(loc="upper right", fontsize=10)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
fig.tight_layout()
fig.savefig(FIGS / "stage1-cross-run-agreement.png", dpi=200)
plt.close(fig)
print("  ✓ stage1-cross-run-agreement.png")


# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 4: Consensus method breakdown (pie/donut)
# ══════════════════════════════════════════════════════════════════════════════
# Donut chart of how each paragraph's final label was decided.
fig, ax = plt.subplots(figsize=(7, 7))
method_order = ["Unanimous (3/3)", "Majority (2/3)", "Judge tiebreaker"]
method_colors = ["#4CAF50", "#FFC107", "#2196F3"]
# Paragraphs that needed a judge but had no judge label are counted as
# "Unresolved". Leaving them out would make the pie renormalise over the
# remaining slices and misreport both percentages and counts, so show them
# whenever any exist.
if consensus_methods.get("Unresolved", 0) > 0:
    method_order.append("Unresolved")
    method_colors.append("#9E9E9E")
method_counts = [consensus_methods.get(m, 0) for m in method_order]
# Convert autopct percentages back to counts using the total actually plotted.
method_total = sum(method_counts)

wedges, texts, autotexts = ax.pie(
    method_counts, labels=method_order, colors=method_colors,
    autopct=lambda p: f"{p:.1f}%\n({int(round(p * method_total / 100)):,})",
    startangle=90, pctdistance=0.75,
    # width < 1 turns the pie into a donut.
    wedgeprops=dict(width=0.45, edgecolor="white", linewidth=2),
)
for t in autotexts:
    t.set_fontsize(10)
ax.set_title("Consensus Resolution Method — Stage 1")
fig.tight_layout()
fig.savefig(FIGS / "stage1-consensus-methods.png", dpi=200)
plt.close(fig)
print("  ✓ stage1-consensus-methods.png")


# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 5: Specificity boundary disagreements (where runs diverge)
# ══════════════════════════════════════════════════════════════════════════════
# For every paragraph whose three runs did not all agree on specificity,
# record the span between the lowest and highest level assigned.
boundary_counts = collections.Counter()
for pid in all_ids:
    specs = [runs[r][pid]["specificity_level"] for r in [1, 2, 3]]
    if len(set(specs)) == 1:
        continue
    low, high = min(specs), max(specs)
    boundary_counts[f"L{low}↔L{high}"] += 1

# Colors are keyed by the exact boundary label. Keys look like "L1↔L2", so a
# substring test such as "1↔2" can never match them.
boundary_colors = {
    "L1↔L2": "#90CAF9",
    "L2↔L3": "#FFE082",
    "L1↔L3": "#FFE082",
    "L3↔L4": "#EF9A9A",
}
fallback_color = "#CE93D8"  # any other span, e.g. L1↔L4 or L2↔L4

fig, ax = plt.subplots(figsize=(8, 5))
boundaries = sorted(boundary_counts.keys())
counts = [boundary_counts[b] for b in boundaries]
colors_b = [boundary_colors.get(b, fallback_color) for b in boundaries]
bars = ax.barh(boundaries, counts, color=colors_b, edgecolor="white")

for bar, count in zip(bars, counts):
    # Offset in points so the gap after each bar does not depend on the scale
    # of the x axis.
    ax.annotate(
        f"{count:,} ({count / N * 100:.1f}%)",
        xy=(bar.get_width(), bar.get_y() + bar.get_height() / 2),
        xytext=(4, 0), textcoords="offset points",
        va="center", fontsize=10,
    )

ax.set_xlabel("Paragraphs with divergent specificity")
ax.set_title("Specificity Boundary Disagreements Across 3 Grok Runs")
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
# default=1 keeps the axis valid when the runs never disagreed (no bars).
ax.set_xlim(0, max(counts, default=1) * 1.25)
fig.tight_layout()
fig.savefig(FIGS / "stage1-specificity-boundaries.png", dpi=200)
plt.close(fig)
print("  ✓ stage1-specificity-boundaries.png")


# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 6: Category × Specificity heatmap (final consensus)
# ══════════════════════════════════════════════════════════════════════════════
# Rows follow cat_order, columns are specificity levels 1-4 (column = level - 1).
cat_row = {cat: i for i, cat in enumerate(cat_order)}
cat_spec_matrix = np.zeros((len(cat_order), 4))
for cat, spec in zip(final_cats, final_specs):
    cat_spec_matrix[cat_row[cat], spec - 1] += 1

# Normalize to row percentages. A category with no paragraphs has a zero row
# sum; leave that row at 0% instead of dividing by zero and producing NaN.
row_sums = cat_spec_matrix.sum(axis=1, keepdims=True)
cat_spec_pct = np.divide(
    cat_spec_matrix, row_sums,
    out=np.zeros_like(cat_spec_matrix), where=row_sums > 0,
) * 100

fig, ax = plt.subplots(figsize=(9, 6))
im = ax.imshow(cat_spec_pct, cmap="YlOrRd", aspect="auto")

for i in range(len(cat_order)):
    for j in range(4):
        count = int(cat_spec_matrix[i, j])
        pct = cat_spec_pct[i, j]
        # White text on the darker (high-percentage) cells for contrast.
        color = "white" if pct > 50 else "black"
        ax.text(j, i, f"{count:,}\n({pct:.0f}%)", ha="center", va="center",
                fontsize=8, color=color)

ax.set_xticks(range(4))
ax.set_xticklabels(["L1", "L2", "L3", "L4"], fontsize=11)
ax.set_yticks(range(len(cat_order)))
ax.set_yticklabels([CAT_ABBREV[c] for c in cat_order], fontsize=11)
ax.set_xlabel("Specificity Level")
ax.set_ylabel("Content Category")
ax.set_title("Category × Specificity — Stage 1 Consensus (row %)")
fig.colorbar(im, ax=ax, label="Row %", shrink=0.8)
fig.tight_layout()
fig.savefig(FIGS / "stage1-category-specificity-heatmap.png", dpi=200)
plt.close(fig)
print("  ✓ stage1-category-specificity-heatmap.png")


# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 7: v1 vs v2 category comparison
# ══════════════════════════════════════════════════════════════════════════════
# Grouped bars comparing category shares between the two labelling rounds.
# v1 distribution from STATUS.md (50,003 paragraphs, different base)
v1_pct = {
    "Risk Management Process": 45.8,
    "Management Role": 17.6,
    "Board Governance": 16.0,
    "Strategy Integration": 10.0,
    "None/Other": 5.0,
    "Third-Party Risk": 5.0,
    "Incident Disclosure": 0.6,
}
# v2 shares are computed from the consensus labels of this run.
v2_pct = {c: cat_counts_final[c] / N * 100 for c in cat_order}

fig, ax = plt.subplots(figsize=(10, 5))
x = np.arange(len(cat_order))
w = 0.35
b1 = ax.bar(x - w / 2, [v1_pct[c] for c in cat_order], w, label="v1 (50K, 3-model panel)",
            color="#90CAF9", edgecolor="white")
# Derive the corpus size in the legend from N so it tracks the loaded data.
b2 = ax.bar(x + w / 2, [v2_pct[c] for c in cat_order], w,
            label=f"v2 ({round(N / 1000)}K, Grok ×3)",
            color="#2196F3", edgecolor="white")

for bar_group in [b1, b2]:
    for bar in bar_group:
        h = bar.get_height()
        # The y axis is in percent, so a fixed 0.3-point gap is scale-safe.
        ax.text(bar.get_x() + bar.get_width() / 2, h + 0.3,
                f"{h:.1f}%", ha="center", va="bottom", fontsize=8)

ax.set_xticks(x)
ax.set_xticklabels([CAT_ABBREV[c] for c in cat_order], fontsize=11)
ax.set_ylabel("Percentage")
ax.set_title("Category Distribution: v1 vs v2 Stage 1")
ax.legend(fontsize=10)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
fig.tight_layout()
fig.savefig(FIGS / "stage1-v1-vs-v2-categories.png", dpi=200)
plt.close(fig)
print("  ✓ stage1-v1-vs-v2-categories.png")


# ── Print summary stats ──────────────────────────────────────────────────────
# Text recap of the same numbers the figures show.
print(f"\n{'═' * 60}")
print("Stage 1 Consensus Summary")
print(f"{'═' * 60}")
print(f"Total paragraphs: {N:,}")
print("\nConsensus methods:")
summary_methods = ["Unanimous (3/3)", "Majority (2/3)", "Judge tiebreaker"]
# Paragraphs with no majority and no judge label are tallied as "Unresolved";
# list them when present so the method counts add up to the total.
if consensus_methods.get("Unresolved", 0) > 0:
    summary_methods.append("Unresolved")
for m in summary_methods:
    c = consensus_methods.get(m, 0)
    print(f"  {m}: {c:,} ({c / N * 100:.1f}%)")
print("\nCategory distribution (consensus):")
for c in cat_order:
    n = cat_counts_final[c]
    print(f"  {CAT_ABBREV[c]:4s} {n:>6,} ({n / N * 100:.1f}%)")
print("\nSpecificity distribution (consensus):")
for i in range(4):
    n = spec_counts_final.get(i + 1, 0)
    print(f"  L{i + 1}  {n:>6,} ({n / N * 100:.1f}%)")