SEC-cyBERT/scripts/plot-stage1-distributions.py
2026-04-05 01:30:39 -04:00

374 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Stage 1 (v2) distribution charts for the writeup.
Generates: category distribution, specificity distribution,
cross-run agreement, consensus method breakdown,
specificity disagreement boundary analysis, a category × specificity
heatmap, and a v1-vs-v2 category comparison.
Usage: uvx --with matplotlib --with numpy python scripts/plot-stage1-distributions.py
"""
import json
import collections
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
# Both directories hang off the directory two levels above this script file.
_REPO_ROOT = Path(__file__).resolve().parents[1]
DATA = _REPO_ROOT / "data"
FIGS = _REPO_ROOT / "figures"
# Every figure is written into FIGS, so make sure it exists up front.
FIGS.mkdir(exist_ok=True)
# ── Color palette ────────────────────────────────────────────────────────────
# One row per content category: (full name, bar colour, short tick label).
# The two lookup dicts below are derived from this single table so that a
# category can never have a colour without an abbreviation (or vice versa).
_CATEGORY_STYLES = (
    ("Risk Management Process", "#2196F3", "RMP"),
    ("Board Governance", "#4CAF50", "BG"),
    ("Management Role", "#FF9800", "MR"),
    ("Strategy Integration", "#9C27B0", "SI"),
    ("None/Other", "#607D8B", "N/O"),
    ("Third-Party Risk", "#F44336", "TP"),
    ("Incident Disclosure", "#00BCD4", "ID"),
)
CAT_COLORS = {name: colour for name, colour, _ in _CATEGORY_STYLES}
CAT_ABBREV = {name: abbrev for name, _, abbrev in _CATEGORY_STYLES}

# Specificity levels 1-4, indexed by (level - 1).
SPEC_COLORS = ["#BDBDBD", "#64B5F6", "#FFB74D", "#EF5350"]
SPEC_LABELS = [
    "L1: Generic\nBoilerplate",
    "L2: Domain-\nAdapted",
    "L3: Firm-\nSpecific",
    "L4: Quantified-\nVerifiable",
]
# ── Load data ────────────────────────────────────────────────────────────────
def _load_labels(path):
    """Read a JSONL annotation file into a ``{paragraphId: label}`` dict.

    Each non-blank line must be a JSON object with ``paragraphId`` and
    ``label`` keys. Blank lines (e.g. a trailing newline at end of file) are
    skipped instead of crashing ``json.loads``. If a paragraphId occurs more
    than once, the last record wins.
    """
    labels = {}
    # Explicit UTF-8 so the result does not depend on the platform's default
    # text encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            record = json.loads(line)
            labels[record["paragraphId"]] = record["label"]
    return labels


_STAGE1_DIR = DATA / "annotations/v2-stage1"

# runs[k][paragraphId] -> label dict produced by run k (k = 1, 2, 3).
runs = {
    run: _load_labels(_STAGE1_DIR / f"grok-4.1-fast.run{run}.jsonl")
    for run in [1, 2, 3]
}

# Load judge results (optional: the file may not exist yet)
judge_path = _STAGE1_DIR / "judge.jsonl"
judge = _load_labels(judge_path) if judge_path.exists() else {}

# Only paragraphs labelled by all three runs take part in the analysis.
all_ids = sorted(set(runs[1]) & set(runs[2]) & set(runs[3]))
N = len(all_ids)
if N == 0:
    # Every percentage below divides by N; stop with a clear message rather
    # than a ZeroDivisionError further down.
    raise SystemExit("No paragraphs are shared by all three runs; nothing to plot.")
print(f"Loaded {N} paragraphs across 3 runs, {len(judge)} judge results")
# ── Compute consensus labels ─────────────────────────────────────────────────
# For every paragraph, take the modal category and modal specificity over the
# three runs. The weaker of the two vote counts decides how the paragraph was
# resolved: 3 → unanimous, 2 → majority, 1 → at least one dimension had three
# different answers, so the judge label (if any) replaces BOTH dimensions.
final_cats = []
final_specs = []
consensus_methods = collections.Counter()
for pid in all_ids:
    run_labels = [runs[r][pid] for r in (1, 2, 3)]
    # most_common(1) yields (value, votes); on ties the earliest run's value wins.
    top_cat, cat_votes = collections.Counter(
        lab["content_category"] for lab in run_labels
    ).most_common(1)[0]
    top_spec, spec_votes = collections.Counter(
        lab["specificity_level"] for lab in run_labels
    ).most_common(1)[0]

    weakest_vote = min(cat_votes, spec_votes)
    if weakest_vote == 3:
        method = "Unanimous (3/3)"
    elif weakest_vote >= 2:
        method = "Majority (2/3)"
    elif pid in judge:
        # Judge tiebreaker
        method = "Judge tiebreaker"
        top_cat = judge[pid]["content_category"]
        top_spec = judge[pid]["specificity_level"]
    else:
        # No judge verdict available: fall back to the per-dimension mode.
        method = "Unresolved"

    consensus_methods[method] += 1
    final_cats.append(top_cat)
    final_specs.append(top_spec)
# Global matplotlib styling shared by every figure below.
_RC_OVERRIDES = (
    ("font.family", "sans-serif"),
    ("font.size", 11),
    ("axes.titlesize", 13),
    ("axes.titleweight", "bold"),
    ("figure.facecolor", "white"),
)
for rc_key, rc_value in _RC_OVERRIDES:
    plt.rcParams[rc_key] = rc_value
# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 1: Category Distribution (final consensus)
# ══════════════════════════════════════════════════════════════════════════════
# Tally of consensus categories; reused by the v1-vs-v2 figure and the summary.
cat_counts_final = collections.Counter(final_cats)
# Fixed display order for categories; reused by later figures and the summary.
cat_order = ["Risk Management Process", "Board Governance", "Management Role",
             "Strategy Integration", "None/Other", "Third-Party Risk", "Incident Disclosure"]

fig, ax = plt.subplots(figsize=(10, 5))
x = np.arange(len(cat_order))
counts = [cat_counts_final[c] for c in cat_order]
colors = [CAT_COLORS[c] for c in cat_order]
bars = ax.bar(x, counts, color=colors, edgecolor="white", linewidth=0.5)

# Gap between a bar's top and its label, scaled to the data so the labels stay
# attached to the bars whatever the corpus size (was a fixed +200 paragraphs).
label_pad = max(counts) * 0.01
for bar, count in zip(bars, counts):
    pct = count / N * 100
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + label_pad,
            f"{count:,}\n({pct:.1f}%)", ha="center", va="bottom", fontsize=9)

ax.set_xticks(x)
ax.set_xticklabels([CAT_ABBREV[c] for c in cat_order], fontsize=11)
ax.set_ylabel("Paragraphs")
# Paragraph count comes from the loaded data rather than a hard-coded figure,
# so the title cannot drift out of sync with the bars.
ax.set_title(f"Content Category Distribution — Stage 1 Consensus ({N:,} paragraphs)")
ax.set_ylim(0, max(counts) * 1.18)  # headroom for the two-line bar labels
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
fig.tight_layout()
fig.savefig(FIGS / "stage1-category-distribution.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-category-distribution.png")
# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 2: Specificity Distribution (final consensus)
# ══════════════════════════════════════════════════════════════════════════════
# Tally of consensus specificity levels; reused by the summary printout.
spec_counts_final = collections.Counter(final_specs)

fig, ax = plt.subplots(figsize=(8, 5))
x = np.arange(4)
# Levels are 1-based in the data; bar i shows level i + 1.
counts = [spec_counts_final.get(level, 0) for level in range(1, 5)]
bars = ax.bar(x, counts, color=SPEC_COLORS, edgecolor="white", linewidth=0.5)

# Gap between a bar's top and its label, scaled to the data so the labels stay
# attached to the bars whatever the corpus size (was a fixed +200 paragraphs).
label_pad = max(counts) * 0.01
for bar, count in zip(bars, counts):
    pct = count / N * 100
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + label_pad,
            f"{count:,}\n({pct:.1f}%)", ha="center", va="bottom", fontsize=9)

ax.set_xticks(x)
ax.set_xticklabels(SPEC_LABELS, fontsize=10)
ax.set_ylabel("Paragraphs")
# Paragraph count comes from the loaded data rather than a hard-coded figure.
ax.set_title(f"Specificity Distribution — Stage 1 Consensus ({N:,} paragraphs)")
ax.set_ylim(0, max(counts) * 1.18)  # headroom for the two-line bar labels
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
fig.tight_layout()
fig.savefig(FIGS / "stage1-specificity-distribution.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-specificity-distribution.png")
# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 3: Cross-run agreement (stacked bar showing unanimity rates)
# ══════════════════════════════════════════════════════════════════════════════
def _agreement_level(values):
    """Classify three run outputs by how many distinct values they contain."""
    distinct = len(set(values))
    if distinct == 1:
        return "Unanimous"
    if distinct == 2:
        return "Majority"
    return "All disagree"


cat_agreement = {"Unanimous": 0, "Majority": 0, "All disagree": 0}
spec_agreement = {"Unanimous": 0, "Majority": 0, "All disagree": 0}
for pid in all_ids:
    cats = [runs[r][pid]["content_category"] for r in [1, 2, 3]]
    specs = [runs[r][pid]["specificity_level"] for r in [1, 2, 3]]
    cat_agreement[_agreement_level(cats)] += 1
    spec_agreement[_agreement_level(specs)] += 1

fig, ax = plt.subplots(figsize=(8, 5))
dims = ["Category", "Specificity"]
# Percentages per dimension, in the same order as `dims`.
unanimous = [cat_agreement["Unanimous"] / N * 100, spec_agreement["Unanimous"] / N * 100]
majority = [cat_agreement["Majority"] / N * 100, spec_agreement["Majority"] / N * 100]
alldis = [cat_agreement["All disagree"] / N * 100, spec_agreement["All disagree"] / N * 100]

x = np.arange(len(dims))
w = 0.5
# Stack the three segments bottom-up: unanimous, then majority, then all-disagree.
ax.bar(x, unanimous, w, label="Unanimous (3/3)", color="#4CAF50")
ax.bar(x, majority, w, bottom=unanimous, label="Majority (2/3)", color="#FFC107")
ax.bar(x, alldis, w, bottom=[u + m for u, m in zip(unanimous, majority)],
       label="All disagree", color="#F44336")

for i, (u, m, a) in enumerate(zip(unanimous, majority, alldis)):
    ax.text(i, u / 2, f"{u:.1f}%", ha="center", va="center", fontsize=11, fontweight="bold", color="white")
    # Skip the label when the segment is too thin to hold it.
    if m > 2:
        ax.text(i, u + m / 2, f"{m:.1f}%", ha="center", va="center", fontsize=10, color="black")
    if a > 0.5:
        ax.text(i, u + m + a / 2, f"{a:.2f}%", ha="center", va="center", fontsize=8, color="white")

ax.set_xticks(x)
ax.set_xticklabels(dims, fontsize=12)
ax.set_ylabel("Percentage of paragraphs")
# Paragraph count comes from the loaded data rather than a hard-coded figure.
ax.set_title(f"Grok ×3 Cross-Run Agreement ({N:,} paragraphs)")
ax.set_ylim(0, 105)
ax.legend(loc="upper right", fontsize=10)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
fig.tight_layout()
fig.savefig(FIGS / "stage1-cross-run-agreement.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-cross-run-agreement.png")
# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 4: Consensus method breakdown (pie/donut)
# ══════════════════════════════════════════════════════════════════════════════
fig, ax = plt.subplots(figsize=(7, 7))
method_order = ["Unanimous (3/3)", "Majority (2/3)", "Judge tiebreaker"]
method_colors = ["#4CAF50", "#FFC107", "#2196F3"]
# Paragraphs with no majority and no judge verdict are counted as "Unresolved"
# by the consensus step. Show them when present so the wedges cover every
# paragraph instead of silently dropping that group.
if consensus_methods.get("Unresolved", 0) > 0:
    method_order.append("Unresolved")
    method_colors.append("#9E9E9E")
method_counts = [consensus_methods.get(m, 0) for m in method_order]

# The pie's percentages are relative to the sum of the plotted counts, so the
# absolute numbers in the labels must be recovered from that same total.
method_total = sum(method_counts)
wedges, texts, autotexts = ax.pie(
    method_counts, labels=method_order, colors=method_colors,
    autopct=lambda p: f"{p:.1f}%\n({int(round(p * method_total / 100)):,})",
    startangle=90, pctdistance=0.75,
    # width < 1 hollows out the centre, turning the pie into a donut.
    wedgeprops=dict(width=0.45, edgecolor="white", linewidth=2),
)
for t in autotexts:
    t.set_fontsize(10)
ax.set_title("Consensus Resolution Method — Stage 1")
fig.tight_layout()
fig.savefig(FIGS / "stage1-consensus-methods.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-consensus-methods.png")
# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 5: Specificity boundary disagreements (where runs diverge)
# ══════════════════════════════════════════════════════════════════════════════
# For each paragraph whose runs disagree on specificity, record the span
# between the lowest and highest level assigned, keyed as e.g. "L1↔L2".
boundary_counts = collections.Counter()
for pid in all_ids:
    specs = [runs[r][pid]["specificity_level"] for r in [1, 2, 3]]
    low, high = min(specs), max(specs)
    if low == high:
        continue  # all three runs agree
    boundary_counts[f"L{low}↔L{high}"] += 1

# Bar colour per boundary. Looked up by the exact key: the previous substring
# tests ("1↔2" etc.) could never match keys of the form "L1↔L2", so every bar
# fell through to the default colour.
BOUNDARY_COLORS = {
    "L1↔L2": "#90CAF9",
    "L2↔L3": "#FFE082",
    "L1↔L3": "#FFE082",
    "L3↔L4": "#EF9A9A",
}
DEFAULT_BOUNDARY_COLOR = "#CE93D8"  # any other span (e.g. L1↔L4, L2↔L4)

fig, ax = plt.subplots(figsize=(8, 5))
boundaries = sorted(boundary_counts.keys())
counts = [boundary_counts[b] for b in boundaries]
colors_b = [BOUNDARY_COLORS.get(b, DEFAULT_BOUNDARY_COLOR) for b in boundaries]
bars = ax.barh(boundaries, counts, color=colors_b, edgecolor="white")

# default=1 keeps the axis valid when the runs never disagree (no bars).
longest = max(counts, default=1)
label_pad = longest * 0.01  # gap between bar end and label, scaled to the data
for bar, count in zip(bars, counts):
    ax.text(bar.get_width() + label_pad, bar.get_y() + bar.get_height() / 2,
            f"{count:,} ({count / N * 100:.1f}%)", va="center", fontsize=10)

ax.set_xlabel("Paragraphs with divergent specificity")
ax.set_title("Specificity Boundary Disagreements Across 3 Grok Runs")
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.set_xlim(0, longest * 1.25)  # room for the count labels right of the bars
fig.tight_layout()
fig.savefig(FIGS / "stage1-specificity-boundaries.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-specificity-boundaries.png")
# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 6: Category × Specificity heatmap (final consensus)
# ══════════════════════════════════════════════════════════════════════════════
# Rows follow cat_order, columns are specificity levels 1-4 (column = level - 1).
cat_spec_matrix = np.zeros((len(cat_order), 4))
row_of = {cat: i for i, cat in enumerate(cat_order)}
for cat, spec in zip(final_cats, final_specs):
    cat_spec_matrix[row_of[cat], spec - 1] += 1

# Normalize to row percentages. A category with no paragraphs has a zero row
# sum; leave that row at 0% instead of dividing by zero and producing NaN
# cells and "nan%" labels.
row_sums = cat_spec_matrix.sum(axis=1, keepdims=True)
cat_spec_pct = np.divide(
    cat_spec_matrix, row_sums,
    out=np.zeros_like(cat_spec_matrix), where=row_sums > 0,
) * 100

fig, ax = plt.subplots(figsize=(9, 6))
im = ax.imshow(cat_spec_pct, cmap="YlOrRd", aspect="auto")
for i in range(len(cat_order)):
    for j in range(4):
        count = int(cat_spec_matrix[i, j])
        pct = cat_spec_pct[i, j]
        # Dark cells (high percentage) need light text to stay readable.
        color = "white" if pct > 50 else "black"
        ax.text(j, i, f"{count:,}\n({pct:.0f}%)", ha="center", va="center",
                fontsize=8, color=color)

ax.set_xticks(range(4))
ax.set_xticklabels(["L1", "L2", "L3", "L4"], fontsize=11)
ax.set_yticks(range(len(cat_order)))
ax.set_yticklabels([CAT_ABBREV[c] for c in cat_order], fontsize=11)
ax.set_xlabel("Specificity Level")
ax.set_ylabel("Content Category")
ax.set_title("Category × Specificity — Stage 1 Consensus (row %)")
fig.colorbar(im, ax=ax, label="Row %", shrink=0.8)
fig.tight_layout()
fig.savefig(FIGS / "stage1-category-specificity-heatmap.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-category-specificity-heatmap.png")
# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 7: v1 vs v2 category comparison
# ══════════════════════════════════════════════════════════════════════════════
# v1 distribution from STATUS.md (50,003 paragraphs, different base)
v1_pct = {
    "Risk Management Process": 45.8,
    "Management Role": 17.6,
    "Board Governance": 16.0,
    "Strategy Integration": 10.0,
    "None/Other": 5.0,
    "Third-Party Risk": 5.0,
    "Incident Disclosure": 0.6,
}
# v2 percentages come from the consensus tally computed in this run.
v2_pct = {c: cat_counts_final[c] / N * 100 for c in cat_order}

fig, ax = plt.subplots(figsize=(10, 5))
x = np.arange(len(cat_order))
w = 0.35
# One (x-offset, percentages, legend label, colour) row per series. The v2
# corpus size in the legend is derived from N so it follows the loaded data
# instead of a hard-coded "72K".
series = [
    (-w / 2, v1_pct, "v1 (50K, 3-model panel)", "#90CAF9"),
    (+w / 2, v2_pct, f"v2 ({N / 1000:.0f}K, Grok ×3)", "#2196F3"),
]
bar_groups = [
    ax.bar(x + offset, [pcts[c] for c in cat_order], w, label=legend,
           color=fill, edgecolor="white")
    for offset, pcts, legend, fill in series
]
# Print each bar's percentage just above it.
for bar_group in bar_groups:
    for bar in bar_group:
        h = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, h + 0.3,
                f"{h:.1f}%", ha="center", va="bottom", fontsize=8)

ax.set_xticks(x)
ax.set_xticklabels([CAT_ABBREV[c] for c in cat_order], fontsize=11)
ax.set_ylabel("Percentage")
ax.set_title("Category Distribution: v1 vs v2 Stage 1")
ax.legend(fontsize=10)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
fig.tight_layout()
fig.savefig(FIGS / "stage1-v1-vs-v2-categories.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-v1-vs-v2-categories.png")
# ── Print summary stats ──────────────────────────────────────────────────────
# Horizontal rule around the heading. The previous expression repeated an
# empty string, so the rules printed as blank lines.
rule = "═" * 60
print(f"\n{rule}")
print("Stage 1 Consensus Summary")
print(rule)
print(f"Total paragraphs: {N:,}")

print("\nConsensus methods:")
summary_methods = ["Unanimous (3/3)", "Majority (2/3)", "Judge tiebreaker"]
# Report unresolved paragraphs when there are any, so the listed counts add
# up to the total.
if consensus_methods.get("Unresolved", 0) > 0:
    summary_methods.append("Unresolved")
for m in summary_methods:
    c = consensus_methods.get(m, 0)
    print(f" {m}: {c:,} ({c / N * 100:.1f}%)")

print("\nCategory distribution (consensus):")
for c in cat_order:
    n = cat_counts_final[c]
    print(f" {CAT_ABBREV[c]:4s} {n:>6,} ({n / N * 100:.1f}%)")

print("\nSpecificity distribution (consensus):")
for level in range(1, 5):
    n = spec_counts_final.get(level, 0)
    print(f" L{level} {n:>6,} ({n / N * 100:.1f}%)")