374 lines
17 KiB
Python
374 lines
17 KiB
Python
"""
|
||
Stage 1 (v2) distribution charts for the writeup.
|
||
Generates: category distribution, specificity distribution,
|
||
cross-run agreement, consensus method breakdown, and
|
||
specificity disagreement boundary analysis.
|
||
|
||
Usage: uvx --with matplotlib --with numpy python scripts/plot-stage1-distributions.py
|
||
"""
|
||
|
||
import json
|
||
import collections
|
||
from pathlib import Path
|
||
import matplotlib
|
||
matplotlib.use("Agg")
|
||
import matplotlib.pyplot as plt
|
||
import matplotlib.ticker as mtick
|
||
import numpy as np
|
||
|
||
# Input and output directories live two levels above this script file.
_PROJECT_ROOT = Path(__file__).resolve().parents[1]
DATA = _PROJECT_ROOT / "data"
FIGS = _PROJECT_ROOT / "figures"
# Every figure is saved under FIGS, so make sure the directory exists.
FIGS.mkdir(exist_ok=True)
|
||
|
||
# ── Color palette ────────────────────────────────────────────────────────────
# One row per content category: (full name, chart abbreviation, bar color).
_CATEGORY_STYLES = (
    ("Risk Management Process", "RMP", "#2196F3"),
    ("Board Governance", "BG", "#4CAF50"),
    ("Management Role", "MR", "#FF9800"),
    ("Strategy Integration", "SI", "#9C27B0"),
    ("None/Other", "N/O", "#607D8B"),
    ("Third-Party Risk", "TP", "#F44336"),
    ("Incident Disclosure", "ID", "#00BCD4"),
)
CAT_COLORS = {name: color for name, _abbrev, color in _CATEGORY_STYLES}
CAT_ABBREV = {name: abbrev for name, abbrev, _color in _CATEGORY_STYLES}

# Specificity levels: list index 0 corresponds to level 1, index 3 to level 4.
SPEC_COLORS = ["#BDBDBD", "#64B5F6", "#FFB74D", "#EF5350"]
SPEC_LABELS = [
    "L1: Generic\nBoilerplate",
    "L2: Domain-\nAdapted",
    "L3: Firm-\nSpecific",
    "L4: Quantified-\nVerifiable",
]
|
||
|
||
# ── Load data ────────────────────────────────────────────────────────────────
def _load_labels(path):
    """Read a JSONL annotation file into a ``{paragraphId: label}`` dict.

    Each non-blank line must be a JSON object with ``paragraphId`` and
    ``label`` keys. If a paragraph id appears more than once, the last
    occurrence wins.
    """
    labels = {}
    # Explicit UTF-8 so parsing does not depend on the platform's default
    # text encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            record = json.loads(line)
            labels[record["paragraphId"]] = record["label"]
    return labels


# One label dict per annotation run, keyed by run number.
runs = {
    run: _load_labels(DATA / f"annotations/v2-stage1/grok-4.1-fast.run{run}.jsonl")
    for run in [1, 2, 3]
}

# Load judge results (optional: an absent file simply means no judge labels)
judge_path = DATA / "annotations/v2-stage1/judge.jsonl"
judge = _load_labels(judge_path) if judge_path.exists() else {}

# Only paragraphs present in all three runs can be compared across runs.
all_ids = sorted(set(runs[1]) & set(runs[2]) & set(runs[3]))
N = len(all_ids)
print(f"Loaded {N} paragraphs across 3 runs, {len(judge)} judge results")
if N == 0:
    # Every percentage below divides by N; fail here with a clear message
    # instead of a ZeroDivisionError in the middle of plotting.
    raise SystemExit("No paragraph is present in all 3 runs; nothing to plot.")
|
||
|
||
# ── Compute consensus labels ─────────────────────────────────────────────────
# For each paragraph, pick a final (category, specificity) pair and record how
# it was decided. Both dimensions must reach the same agreement level: the
# weaker of the two vote counts determines the method.
final_cats = []
final_specs = []
consensus_methods = collections.Counter()

for pid in all_ids:
    run_labels = [runs[r][pid] for r in (1, 2, 3)]
    cat_votes = collections.Counter(lbl["content_category"] for lbl in run_labels)
    spec_votes = collections.Counter(lbl["specificity_level"] for lbl in run_labels)

    # Plurality winner and its vote count for each dimension.
    chosen_cat, cat_support = cat_votes.most_common(1)[0]
    chosen_spec, spec_support = spec_votes.most_common(1)[0]
    weakest_support = min(cat_support, spec_support)

    if weakest_support == 3:
        method = "Unanimous (3/3)"
    elif weakest_support == 2:
        method = "Majority (2/3)"
    elif pid in judge:
        # Judge tiebreaker: the judge's label replaces both dimensions.
        method = "Judge tiebreaker"
        chosen_cat = judge[pid]["content_category"]
        chosen_spec = judge[pid]["specificity_level"]
    else:
        # No majority and no judge verdict: keep the plurality winners.
        method = "Unresolved"

    consensus_methods[method] += 1
    final_cats.append(chosen_cat)
    final_specs.append(chosen_spec)
|
||
|
||
# Global matplotlib styling applied to every figure created after this point.
_RC_OVERRIDES = {
    "font.family": "sans-serif",
    "font.size": 11,
    "axes.titlesize": 13,
    "axes.titleweight": "bold",
    "figure.facecolor": "white",
}
plt.rcParams.update(_RC_OVERRIDES)
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 1: Category Distribution (final consensus)
# ══════════════════════════════════════════════════════════════════════════════
# Bar chart of consensus category counts, one bar per category in cat_order.
cat_counts_final = collections.Counter(final_cats)
cat_order = ["Risk Management Process", "Board Governance", "Management Role",
             "Strategy Integration", "None/Other", "Third-Party Risk", "Incident Disclosure"]

fig, ax = plt.subplots(figsize=(10, 5))
x = np.arange(len(cat_order))
counts = [cat_counts_final[c] for c in cat_order]
colors = [CAT_COLORS[c] for c in cat_order]
bars = ax.bar(x, counts, color=colors, edgecolor="white", linewidth=0.5)

# Annotate each bar with its count and share of all paragraphs. bar_label
# pads in points, so the gap above the bar is the same whatever the corpus
# size (a fixed data-unit offset only looks right at one scale).
ax.bar_label(
    bars,
    labels=[f"{count:,}\n({count / N * 100:.1f}%)" for count in counts],
    padding=3,
    fontsize=9,
)

ax.set_xticks(x)
ax.set_xticklabels([CAT_ABBREV[c] for c in cat_order], fontsize=11)
ax.set_ylabel("Paragraphs")
# Report the paragraph count actually loaded rather than a hard-coded figure.
ax.set_title(f"Content Category Distribution — Stage 1 Consensus ({N:,} paragraphs)")
ax.set_ylim(0, max(counts) * 1.18)  # headroom for the two-line bar labels
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
fig.tight_layout()
fig.savefig(FIGS / "stage1-category-distribution.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-category-distribution.png")
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 2: Specificity Distribution (final consensus)
# ══════════════════════════════════════════════════════════════════════════════
# Bar chart of consensus specificity counts for levels 1 through 4.
spec_counts_final = collections.Counter(final_specs)

fig, ax = plt.subplots(figsize=(8, 5))
x = np.arange(4)
# Bar index i holds specificity level i + 1.
counts = [spec_counts_final.get(level, 0) for level in range(1, 5)]
bars = ax.bar(x, counts, color=SPEC_COLORS, edgecolor="white", linewidth=0.5)

# Annotate each bar with its count and share of all paragraphs. bar_label
# pads in points, so the gap above the bar is the same whatever the corpus
# size (a fixed data-unit offset only looks right at one scale).
ax.bar_label(
    bars,
    labels=[f"{count:,}\n({count / N * 100:.1f}%)" for count in counts],
    padding=3,
    fontsize=9,
)

ax.set_xticks(x)
ax.set_xticklabels(SPEC_LABELS, fontsize=10)
ax.set_ylabel("Paragraphs")
# Report the paragraph count actually loaded rather than a hard-coded figure.
ax.set_title(f"Specificity Distribution — Stage 1 Consensus ({N:,} paragraphs)")
ax.set_ylim(0, max(counts) * 1.18)  # headroom for the two-line bar labels
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
fig.tight_layout()
fig.savefig(FIGS / "stage1-specificity-distribution.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-specificity-distribution.png")
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 3: Cross-run agreement (stacked bar showing unanimity rates)
# ══════════════════════════════════════════════════════════════════════════════
# With three runs, the number of distinct labels fixes the agreement level:
# 1 distinct → all agree, 2 → two agree, 3 → no two agree.
_BUCKET_BY_DISTINCT = {1: "Unanimous", 2: "Majority", 3: "All disagree"}
cat_agreement = dict.fromkeys(_BUCKET_BY_DISTINCT.values(), 0)
spec_agreement = dict.fromkeys(_BUCKET_BY_DISTINCT.values(), 0)

for pid in all_ids:
    cats = [runs[r][pid]["content_category"] for r in [1, 2, 3]]
    specs = [runs[r][pid]["specificity_level"] for r in [1, 2, 3]]
    cat_agreement[_BUCKET_BY_DISTINCT[len(set(cats))]] += 1
    spec_agreement[_BUCKET_BY_DISTINCT[len(set(specs))]] += 1

fig, ax = plt.subplots(figsize=(8, 5))
dims = ["Category", "Specificity"]
# Percentages per dimension, in the same order as `dims`.
unanimous = [cat_agreement["Unanimous"] / N * 100, spec_agreement["Unanimous"] / N * 100]
majority = [cat_agreement["Majority"] / N * 100, spec_agreement["Majority"] / N * 100]
alldis = [cat_agreement["All disagree"] / N * 100, spec_agreement["All disagree"] / N * 100]

x = np.arange(len(dims))
w = 0.5
# Stack the three segments bottom-up: unanimous, majority, all disagree.
ax.bar(x, unanimous, w, label="Unanimous (3/3)", color="#4CAF50")
ax.bar(x, majority, w, bottom=unanimous, label="Majority (2/3)", color="#FFC107")
ax.bar(x, alldis, w, bottom=[u + m for u, m in zip(unanimous, majority)],
       label="All disagree", color="#F44336")

# Label each segment at its vertical midpoint; thin segments are left
# unlabeled so the text does not spill outside them.
for i, (u, m, a) in enumerate(zip(unanimous, majority, alldis)):
    ax.text(i, u / 2, f"{u:.1f}%", ha="center", va="center", fontsize=11, fontweight="bold", color="white")
    if m > 2:
        ax.text(i, u + m / 2, f"{m:.1f}%", ha="center", va="center", fontsize=10, color="black")
    if a > 0.5:
        ax.text(i, u + m + a / 2, f"{a:.2f}%", ha="center", va="center", fontsize=8, color="white")

ax.set_xticks(x)
ax.set_xticklabels(dims, fontsize=12)
ax.set_ylabel("Percentage of paragraphs")
# Report the paragraph count actually loaded rather than a hard-coded figure.
ax.set_title(f"Grok ×3 Cross-Run Agreement ({N:,} paragraphs)")
ax.set_ylim(0, 105)
ax.legend(loc="upper right", fontsize=10)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
fig.tight_layout()
fig.savefig(FIGS / "stage1-cross-run-agreement.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-cross-run-agreement.png")
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 4: Consensus method breakdown (pie/donut)
# ══════════════════════════════════════════════════════════════════════════════
fig, ax = plt.subplots(figsize=(7, 7))
# Every method the consensus step can record, with its slice color.
# "Unresolved" is included so the slices always account for all paragraphs.
_method_palette = {
    "Unanimous (3/3)": "#4CAF50",
    "Majority (2/3)": "#FFC107",
    "Judge tiebreaker": "#2196F3",
    "Unresolved": "#9E9E9E",
}
# Drop empty slices: a zero-width wedge only adds an overlapping "0.0%" label.
method_order = [m for m in _method_palette if consensus_methods.get(m, 0) > 0]
method_counts = [consensus_methods[m] for m in method_order]
method_colors = [_method_palette[m] for m in method_order]
# autopct receives a percentage of the plotted total, so convert back to a
# count using that same total.
method_total = sum(method_counts)

wedges, texts, autotexts = ax.pie(
    method_counts, labels=method_order, colors=method_colors,
    autopct=lambda p: f"{p:.1f}%\n({int(round(p * method_total / 100)):,})",
    startangle=90, pctdistance=0.75,
    # width < 1 hollows out the pie into a donut.
    wedgeprops=dict(width=0.45, edgecolor="white", linewidth=2),
)
for t in autotexts:
    t.set_fontsize(10)
ax.set_title("Consensus Resolution Method — Stage 1")
fig.tight_layout()
fig.savefig(FIGS / "stage1-consensus-methods.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-consensus-methods.png")
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 5: Specificity boundary disagreements (where runs diverge)
# ══════════════════════════════════════════════════════════════════════════════
# For each paragraph whose runs disagree on specificity, count the span
# between the lowest and highest level assigned (e.g. "L1↔L3").
boundary_counts = collections.Counter()
for pid in all_ids:
    specs = [runs[r][pid]["specificity_level"] for r in [1, 2, 3]]
    low, high = min(specs), max(specs)
    if low == high:
        continue  # all three runs agree; no boundary to record
    boundary_counts[f"L{low}↔L{high}"] += 1

fig, ax = plt.subplots(figsize=(8, 5))
boundaries = sorted(boundary_counts.keys())
counts = [boundary_counts[b] for b in boundaries]
# Colors are looked up by the full key. Substring tests such as "1↔2" can
# never match because every key has an "L" between the arrow and the digit.
_boundary_palette = {
    "L1↔L2": "#90CAF9",
    "L2↔L3": "#FFE082",
    "L1↔L3": "#FFE082",
    "L3↔L4": "#EF9A9A",
}
colors_b = [_boundary_palette.get(b, "#CE93D8") for b in boundaries]
bars = ax.barh(boundaries, counts, color=colors_b, edgecolor="white")

# Count and share of all paragraphs, placed just past the end of each bar.
ax.bar_label(
    bars,
    labels=[f"{count:,} ({count / N * 100:.1f}%)" for count in counts],
    padding=4,
    fontsize=10,
)

ax.set_xlabel("Paragraphs with divergent specificity")
ax.set_title("Specificity Boundary Disagreements Across 3 Grok Runs")
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
# default=1 keeps the axis valid when no paragraph has a disagreement.
ax.set_xlim(0, max(counts, default=1) * 1.25)
fig.tight_layout()
fig.savefig(FIGS / "stage1-specificity-boundaries.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-specificity-boundaries.png")
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 6: Category × Specificity heatmap (final consensus)
# ══════════════════════════════════════════════════════════════════════════════
# Rows follow cat_order; column j holds specificity level j + 1.
_cat_row = {c: i for i, c in enumerate(cat_order)}
cat_spec_matrix = np.zeros((len(cat_order), 4))
for cat, spec in zip(final_cats, final_specs):
    cat_spec_matrix[_cat_row[cat], spec - 1] += 1

# Normalize to row percentages. A category with no paragraphs keeps a row of
# zeros instead of producing NaN (0/0) and "nan%" cell labels.
row_sums = cat_spec_matrix.sum(axis=1, keepdims=True)
cat_spec_pct = np.divide(
    cat_spec_matrix,
    row_sums,
    out=np.zeros_like(cat_spec_matrix),
    where=row_sums > 0,
) * 100

fig, ax = plt.subplots(figsize=(9, 6))
im = ax.imshow(cat_spec_pct, cmap="YlOrRd", aspect="auto")

# Write the raw count and row percentage into every cell.
for i in range(len(cat_order)):
    for j in range(4):
        count = int(cat_spec_matrix[i, j])
        pct = cat_spec_pct[i, j]
        # Dark cells (high percentage) need light text to stay readable.
        color = "white" if pct > 50 else "black"
        ax.text(j, i, f"{count:,}\n({pct:.0f}%)", ha="center", va="center",
                fontsize=8, color=color)

ax.set_xticks(range(4))
ax.set_xticklabels(["L1", "L2", "L3", "L4"], fontsize=11)
ax.set_yticks(range(len(cat_order)))
ax.set_yticklabels([CAT_ABBREV[c] for c in cat_order], fontsize=11)
ax.set_xlabel("Specificity Level")
ax.set_ylabel("Content Category")
ax.set_title("Category × Specificity — Stage 1 Consensus (row %)")
fig.colorbar(im, ax=ax, label="Row %", shrink=0.8)
fig.tight_layout()
fig.savefig(FIGS / "stage1-category-specificity-heatmap.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-category-specificity-heatmap.png")
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 7: v1 vs v2 category comparison
# ══════════════════════════════════════════════════════════════════════════════
# Grouped bars: for every category, the v1 share next to the v2 share.
# v1 distribution from STATUS.md (50,003 paragraphs, different base)
v1_pct = {
    "Risk Management Process": 45.8,
    "Management Role": 17.6,
    "Board Governance": 16.0,
    "Strategy Integration": 10.0,
    "None/Other": 5.0,
    "Third-Party Risk": 5.0,
    "Incident Disclosure": 0.6,
}
v2_pct = {c: cat_counts_final[c] / N * 100 for c in cat_order}

fig, ax = plt.subplots(figsize=(10, 5))
x = np.arange(len(cat_order))
w = 0.35

# (percentages by category, signed half-width shift, legend label, color)
_series = (
    (v1_pct, -1, "v1 (50K, 3-model panel)", "#90CAF9"),
    (v2_pct, +1, "v2 (72K, Grok ×3)", "#2196F3"),
)
bar_groups = []
for pct_by_cat, side, legend_label, fill in _series:
    heights = [pct_by_cat[c] for c in cat_order]
    bar_groups.append(
        ax.bar(x + side * (w / 2), heights, w, label=legend_label,
               color=fill, edgecolor="white")
    )

# Percentage label centered just above each bar.
for rect in (r for group in bar_groups for r in group):
    top = rect.get_height()
    center = rect.get_x() + rect.get_width() / 2
    ax.text(center, top + 0.3, f"{top:.1f}%", ha="center", va="bottom", fontsize=8)

ax.set_xticks(x)
ax.set_xticklabels([CAT_ABBREV[c] for c in cat_order], fontsize=11)
ax.set_ylabel("Percentage")
ax.set_title("Category Distribution: v1 vs v2 Stage 1")
ax.legend(fontsize=10)
for side_name in ("top", "right"):
    ax.spines[side_name].set_visible(False)
fig.tight_layout()
fig.savefig(FIGS / "stage1-v1-vs-v2-categories.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-v1-vs-v2-categories.png")
|
||
|
||
|
||
# ── Print summary stats ──────────────────────────────────────────────────────
# Console recap of the numbers behind the figures.
_rule = "═" * 60
print(f"\n{_rule}")
print("Stage 1 Consensus Summary")
print(_rule)
print(f"Total paragraphs: {N:,}")

print("\nConsensus methods:")
_summary_methods = ["Unanimous (3/3)", "Majority (2/3)", "Judge tiebreaker"]
# List unresolved paragraphs when there are any, so the method percentages
# account for every paragraph.
if consensus_methods.get("Unresolved", 0):
    _summary_methods.append("Unresolved")
for m in _summary_methods:
    c = consensus_methods.get(m, 0)
    print(f" {m}: {c:,} ({c / N * 100:.1f}%)")

print("\nCategory distribution (consensus):")
for c in cat_order:
    n = cat_counts_final[c]
    print(f" {CAT_ABBREV[c]:4s} {n:>6,} ({n / N * 100:.1f}%)")

print("\nSpecificity distribution (consensus):")
for level in range(1, 5):
    n = spec_counts_final.get(level, 0)
    print(f" L{level} {n:>6,} ({n / N * 100:.1f}%)")
|