"""
Stage 1 (v2) distribution charts for the writeup.

Generates: category distribution, specificity distribution, cross-run
agreement, consensus method breakdown, specificity disagreement boundary
analysis, a category × specificity heatmap, and a v1-vs-v2 category
comparison. Also prints a text summary of the consensus results.

Usage:
    uvx --with matplotlib --with numpy python scripts/plot-stage1-distributions.py
"""

import json
import collections
from pathlib import Path

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np

DATA = Path(__file__).resolve().parent.parent / "data"
FIGS = Path(__file__).resolve().parent.parent / "figures"
FIGS.mkdir(exist_ok=True)

# ── Color palette ────────────────────────────────────────────────────────────
CAT_COLORS = {
    "Risk Management Process": "#2196F3",
    "Board Governance": "#4CAF50",
    "Management Role": "#FF9800",
    "Strategy Integration": "#9C27B0",
    "None/Other": "#607D8B",
    "Third-Party Risk": "#F44336",
    "Incident Disclosure": "#00BCD4",
}
CAT_ABBREV = {
    "Risk Management Process": "RMP",
    "Board Governance": "BG",
    "Management Role": "MR",
    "Strategy Integration": "SI",
    "None/Other": "N/O",
    "Third-Party Risk": "TP",
    "Incident Disclosure": "ID",
}
SPEC_COLORS = ["#BDBDBD", "#64B5F6", "#FFB74D", "#EF5350"]
SPEC_LABELS = [
    "L1: Generic\nBoilerplate",
    "L2: Domain-\nAdapted",
    "L3: Firm-\nSpecific",
    "L4: Quantified-\nVerifiable",
]

# Bar colors for Figure 5, keyed by the exact "L{low}↔L{high}" label built
# there. Any boundary not listed (e.g. L1↔L4, L2↔L4) uses the default color.
BOUNDARY_COLORS = {
    "L1↔L2": "#90CAF9",
    "L2↔L3": "#FFE082",
    "L1↔L3": "#FFE082",
    "L3↔L4": "#EF9A9A",
}
BOUNDARY_DEFAULT_COLOR = "#CE93D8"

RUN_IDS = [1, 2, 3]
SPEC_LEVELS = (1, 2, 3, 4)


# ── Helpers ──────────────────────────────────────────────────────────────────
def load_labels(path):
    """Return ``{paragraphId: label}`` parsed from the JSONL file at *path*.

    Blank lines are skipped. If a paragraphId appears more than once, the
    last record wins.
    """
    labels = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing lines
            r = json.loads(line)
            labels[r["paragraphId"]] = r["label"]
    return labels


def hide_top_right_spines(ax):
    """Hide the top and right axis spines of *ax*."""
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)


def label_bars(ax, bars, counts, total, offset=200):
    """Write "count (pct%)" above each vertical bar; pct is count / total."""
    for bar, count in zip(bars, counts):
        pct = count / total * 100
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + offset,
            f"{count:,}\n({pct:.1f}%)",
            ha="center",
            va="bottom",
            fontsize=9,
        )


# ── Load data ────────────────────────────────────────────────────────────────
runs = {
    run: load_labels(DATA / f"annotations/v2-stage1/grok-4.1-fast.run{run}.jsonl")
    for run in RUN_IDS
}

# Load judge results (optional file)
judge_path = DATA / "annotations/v2-stage1/judge.jsonl"
judge = load_labels(judge_path) if judge_path.exists() else {}

# Only paragraphs labelled in all three runs are analysed.
all_ids = sorted(set(runs[1]) & set(runs[2]) & set(runs[3]))
N = len(all_ids)
print(f"Loaded {N} paragraphs across 3 runs, {len(judge)} judge results")
if N == 0:
    # Every percentage below divides by N.
    raise SystemExit("No paragraphs are present in all three runs; nothing to plot.")

# ── Compute consensus labels ─────────────────────────────────────────────────
final_cats = []
final_specs = []
consensus_methods = collections.Counter()

for pid in all_ids:
    cats = [runs[r][pid]["content_category"] for r in RUN_IDS]
    specs = [runs[r][pid]["specificity_level"] for r in RUN_IDS]
    # most_common(1) yields the modal label and its vote count; on a
    # three-way split it returns the first run's label.
    top_cat, cat_max = collections.Counter(cats).most_common(1)[0]
    top_spec, spec_max = collections.Counter(specs).most_common(1)[0]

    if cat_max == 3 and spec_max == 3:
        method = "Unanimous (3/3)"
    elif cat_max >= 2 and spec_max >= 2:
        method = "Majority (2/3)"
    elif pid in judge:
        # Judge tiebreaker: at least one dimension had no majority, so both
        # labels are taken from the judge.
        method = "Judge tiebreaker"
        top_cat = judge[pid]["content_category"]
        top_spec = judge[pid]["specificity_level"]
    else:
        # No majority and no judge verdict: fall back to the modal labels.
        method = "Unresolved"

    consensus_methods[method] += 1
    final_cats.append(top_cat)
    final_specs.append(top_spec)

plt.rcParams.update({
    "font.family": "sans-serif",
    "font.size": 11,
    "axes.titlesize": 13,
    "axes.titleweight": "bold",
    "figure.facecolor": "white",
})

# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 1: Category Distribution (final consensus)
# ══════════════════════════════════════════════════════════════════════════════
cat_counts_final = collections.Counter(final_cats)
cat_order = [
    "Risk Management Process",
    "Board Governance",
    "Management Role",
    "Strategy Integration",
    "None/Other",
    "Third-Party Risk",
    "Incident Disclosure",
]

fig, ax = plt.subplots(figsize=(10, 5))
x = np.arange(len(cat_order))
counts = [cat_counts_final[c] for c in cat_order]
colors = [CAT_COLORS[c] for c in cat_order]
bars = ax.bar(x, counts, color=colors, edgecolor="white", linewidth=0.5)
label_bars(ax, bars, counts, N)

ax.set_xticks(x)
ax.set_xticklabels([CAT_ABBREV[c] for c in cat_order], fontsize=11)
ax.set_ylabel("Paragraphs")
ax.set_title(f"Content Category Distribution — Stage 1 Consensus ({N:,} paragraphs)")
ax.set_ylim(0, max(counts) * 1.18)
hide_top_right_spines(ax)
fig.tight_layout()
fig.savefig(FIGS / "stage1-category-distribution.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-category-distribution.png")

# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 2: Specificity Distribution (final consensus)
# ══════════════════════════════════════════════════════════════════════════════
spec_counts_final = collections.Counter(final_specs)

fig, ax = plt.subplots(figsize=(8, 5))
x = np.arange(4)
counts = [spec_counts_final.get(i + 1, 0) for i in range(4)]
bars = ax.bar(x, counts, color=SPEC_COLORS, edgecolor="white", linewidth=0.5)
label_bars(ax, bars, counts, N)

ax.set_xticks(x)
ax.set_xticklabels(SPEC_LABELS, fontsize=10)
ax.set_ylabel("Paragraphs")
ax.set_title(f"Specificity Distribution — Stage 1 Consensus ({N:,} paragraphs)")
ax.set_ylim(0, max(counts) * 1.18)
hide_top_right_spines(ax)
fig.tight_layout()
fig.savefig(FIGS / "stage1-specificity-distribution.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-specificity-distribution.png")

# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 3: Cross-run agreement (stacked bar showing unanimity rates)
# ══════════════════════════════════════════════════════════════════════════════
cat_agreement = {"Unanimous": 0, "Majority": 0, "All disagree": 0}
spec_agreement = {"Unanimous": 0, "Majority": 0, "All disagree": 0}


def agreement_bucket(labels):
    """Classify three run labels by how many distinct values they contain."""
    distinct = len(set(labels))
    if distinct == 1:
        return "Unanimous"
    if distinct == 2:
        return "Majority"
    return "All disagree"


for pid in all_ids:
    cats = [runs[r][pid]["content_category"] for r in RUN_IDS]
    specs = [runs[r][pid]["specificity_level"] for r in RUN_IDS]
    cat_agreement[agreement_bucket(cats)] += 1
    spec_agreement[agreement_bucket(specs)] += 1

fig, ax = plt.subplots(figsize=(8, 5))
dims = ["Category", "Specificity"]
unanimous = [d["Unanimous"] / N * 100 for d in (cat_agreement, spec_agreement)]
majority = [d["Majority"] / N * 100 for d in (cat_agreement, spec_agreement)]
alldis = [d["All disagree"] / N * 100 for d in (cat_agreement, spec_agreement)]

x = np.arange(len(dims))
w = 0.5
ax.bar(x, unanimous, w, label="Unanimous (3/3)", color="#4CAF50")
ax.bar(x, majority, w, bottom=unanimous, label="Majority (2/3)", color="#FFC107")
ax.bar(
    x,
    alldis,
    w,
    bottom=[u + m for u, m in zip(unanimous, majority)],
    label="All disagree",
    color="#F44336",
)

for i, (u, m, a) in enumerate(zip(unanimous, majority, alldis)):
    ax.text(i, u / 2, f"{u:.1f}%", ha="center", va="center",
            fontsize=11, fontweight="bold", color="white")
    # Segments too thin to hold a label are left unannotated.
    if m > 2:
        ax.text(i, u + m / 2, f"{m:.1f}%", ha="center", va="center",
                fontsize=10, color="black")
    if a > 0.5:
        ax.text(i, u + m + a / 2, f"{a:.2f}%", ha="center", va="center",
                fontsize=8, color="white")

ax.set_xticks(x)
ax.set_xticklabels(dims, fontsize=12)
ax.set_ylabel("Percentage of paragraphs")
ax.set_title(f"Grok ×3 Cross-Run Agreement ({N:,} paragraphs)")
ax.set_ylim(0, 105)
ax.legend(loc="upper right", fontsize=10)
hide_top_right_spines(ax)
fig.tight_layout()
fig.savefig(FIGS / "stage1-cross-run-agreement.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-cross-run-agreement.png")

# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 4: Consensus method breakdown (pie/donut)
# ══════════════════════════════════════════════════════════════════════════════
fig, ax = plt.subplots(figsize=(7, 7))
method_order = ["Unanimous (3/3)", "Majority (2/3)", "Judge tiebreaker"]
method_colors = ["#4CAF50", "#FFC107", "#2196F3"]
# "Unresolved" paragraphs are part of N; show them when present so the wedge
# percentages and counts refer to the full dataset.
if consensus_methods.get("Unresolved", 0) > 0:
    method_order.append("Unresolved")
    method_colors.append("#9E9E9E")
method_counts = [consensus_methods.get(m, 0) for m in method_order]
pie_total = sum(method_counts)

# autopct receives each wedge's percentage of the plotted total, so the count
# is recovered from that same total.
_, _, autotexts = ax.pie(
    method_counts,
    labels=method_order,
    colors=method_colors,
    autopct=lambda p: f"{p:.1f}%\n({int(round(p * pie_total / 100)):,})",
    startangle=90,
    pctdistance=0.75,
    wedgeprops=dict(width=0.45, edgecolor="white", linewidth=2),
)
for t in autotexts:
    t.set_fontsize(10)
ax.set_title("Consensus Resolution Method — Stage 1")
fig.tight_layout()
fig.savefig(FIGS / "stage1-consensus-methods.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-consensus-methods.png")

# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 5: Specificity boundary disagreements (where runs diverge)
# ══════════════════════════════════════════════════════════════════════════════
boundary_counts = collections.Counter()
for pid in all_ids:
    specs = [runs[r][pid]["specificity_level"] for r in RUN_IDS]
    if len(set(specs)) == 1:
        continue
    low, high = min(specs), max(specs)
    boundary_counts[f"L{low}↔L{high}"] += 1

fig, ax = plt.subplots(figsize=(8, 5))
boundaries = sorted(boundary_counts.keys())
counts = [boundary_counts[b] for b in boundaries]
colors_b = [BOUNDARY_COLORS.get(b, BOUNDARY_DEFAULT_COLOR) for b in boundaries]

if boundaries:
    bars = ax.barh(boundaries, counts, color=colors_b, edgecolor="white")
    for bar, count in zip(bars, counts):
        ax.text(
            bar.get_width() + 20,
            bar.get_y() + bar.get_height() / 2,
            f"{count:,} ({count / N * 100:.1f}%)",
            va="center",
            fontsize=10,
        )
    ax.set_xlim(0, max(counts) * 1.25)
else:
    # All runs agreed on specificity everywhere; still emit the figure.
    ax.text(0.5, 0.5, "No specificity disagreements", transform=ax.transAxes,
            ha="center", va="center", fontsize=11)

ax.set_xlabel("Paragraphs with divergent specificity")
ax.set_title("Specificity Boundary Disagreements Across 3 Grok Runs")
hide_top_right_spines(ax)
fig.tight_layout()
fig.savefig(FIGS / "stage1-specificity-boundaries.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-specificity-boundaries.png")

# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 6: Category × Specificity heatmap (final consensus)
# ══════════════════════════════════════════════════════════════════════════════
cat_index = {c: i for i, c in enumerate(cat_order)}
cat_spec_matrix = np.zeros((len(cat_order), 4))
skipped = 0
for cat, spec in zip(final_cats, final_specs):
    # Guard against labels outside the known taxonomy: an unknown category has
    # no row, and a level of 0 would wrap around to the L4 column via index -1.
    if cat not in cat_index or spec not in SPEC_LEVELS:
        skipped += 1
        continue
    cat_spec_matrix[cat_index[cat], spec - 1] += 1
if skipped:
    print(f" ! heatmap: skipped {skipped:,} paragraphs with an unexpected "
          "category or specificity level")

# Normalize to row percentages; rows with no paragraphs stay at 0% rather
# than dividing by zero.
row_sums = cat_spec_matrix.sum(axis=1, keepdims=True)
cat_spec_pct = np.divide(
    cat_spec_matrix,
    row_sums,
    out=np.zeros_like(cat_spec_matrix),
    where=row_sums > 0,
) * 100

fig, ax = plt.subplots(figsize=(9, 6))
im = ax.imshow(cat_spec_pct, cmap="YlOrRd", aspect="auto")

for i in range(len(cat_order)):
    for j in range(4):
        count = int(cat_spec_matrix[i, j])
        pct = cat_spec_pct[i, j]
        # Switch to white text on the darker (high-percentage) cells.
        color = "white" if pct > 50 else "black"
        ax.text(j, i, f"{count:,}\n({pct:.0f}%)", ha="center", va="center",
                fontsize=8, color=color)

ax.set_xticks(range(4))
ax.set_xticklabels(["L1", "L2", "L3", "L4"], fontsize=11)
ax.set_yticks(range(len(cat_order)))
ax.set_yticklabels([CAT_ABBREV[c] for c in cat_order], fontsize=11)
ax.set_xlabel("Specificity Level")
ax.set_ylabel("Content Category")
ax.set_title("Category × Specificity — Stage 1 Consensus (row %)")
fig.colorbar(im, ax=ax, label="Row %", shrink=0.8)
fig.tight_layout()
fig.savefig(FIGS / "stage1-category-specificity-heatmap.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-category-specificity-heatmap.png")

# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 7: v1 vs v2 category comparison
# ══════════════════════════════════════════════════════════════════════════════
# v1 distribution from STATUS.md (50,003 paragraphs, different base)
v1_pct = {
    "Risk Management Process": 45.8,
    "Management Role": 17.6,
    "Board Governance": 16.0,
    "Strategy Integration": 10.0,
    "None/Other": 5.0,
    "Third-Party Risk": 5.0,
    "Incident Disclosure": 0.6,
}
v2_pct = {c: cat_counts_final[c] / N * 100 for c in cat_order}

fig, ax = plt.subplots(figsize=(10, 5))
x = np.arange(len(cat_order))
w = 0.35
b1 = ax.bar(x - w / 2, [v1_pct[c] for c in cat_order], w,
            label="v1 (50K, 3-model panel)", color="#90CAF9", edgecolor="white")
b2 = ax.bar(x + w / 2, [v2_pct[c] for c in cat_order], w,
            label=f"v2 ({N:,}, Grok ×3)", color="#2196F3", edgecolor="white")

for bar_group in [b1, b2]:
    for bar in bar_group:
        h = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, h + 0.3, f"{h:.1f}%",
                ha="center", va="bottom", fontsize=8)

ax.set_xticks(x)
ax.set_xticklabels([CAT_ABBREV[c] for c in cat_order], fontsize=11)
ax.set_ylabel("Percentage")
ax.set_title("Category Distribution: v1 vs v2 Stage 1")
ax.legend(fontsize=10)
hide_top_right_spines(ax)
fig.tight_layout()
fig.savefig(FIGS / "stage1-v1-vs-v2-categories.png", dpi=200)
plt.close(fig)
print(" ✓ stage1-v1-vs-v2-categories.png")

# ── Print summary stats ──────────────────────────────────────────────────────
print(f"\n{'═' * 60}")
print("Stage 1 Consensus Summary")
print(f"{'═' * 60}")
print(f"Total paragraphs: {N:,}")

print("\nConsensus methods:")
summary_methods = ["Unanimous (3/3)", "Majority (2/3)", "Judge tiebreaker"]
if consensus_methods.get("Unresolved", 0) > 0:
    # Report unresolved paragraphs so the listed methods add up to N.
    summary_methods.append("Unresolved")
for m in summary_methods:
    c = consensus_methods.get(m, 0)
    print(f" {m}: {c:,} ({c / N * 100:.1f}%)")

print("\nCategory distribution (consensus):")
for c in cat_order:
    n = cat_counts_final[c]
    print(f" {CAT_ABBREV[c]:4s} {n:>6,} ({n / N * 100:.1f}%)")

print("\nSpecificity distribution (consensus):")
for i in range(4):
    n = spec_counts_final.get(i + 1, 0)
    print(f" L{i + 1} {n:>6,} ({n / N * 100:.1f}%)")