SEC-cyBERT/scripts/examine-hard-cases.py
2026-04-03 14:43:53 -04:00

737 lines
30 KiB
Python

#!/usr/bin/env python3
"""Examine hardest disagreement cases in the SEC cybersecurity holdout dataset.
Identifies paragraphs where the 13 annotation sources split on the three main
confusion axes (MR<->RMP, BG<->MR, SI<->N/O), shows representative examples,
extracts linguistic patterns, and recommends codebook rulings.
Run: uv run --with numpy scripts/examine-hard-cases.py
"""
import json
import os
import re
import textwrap
from collections import Counter, defaultdict
from pathlib import Path
import numpy as np
# ── Constants ──────────────────────────────────────────────────────────────────
# Repository root, resolved relative to this script (scripts/ -> repo root).
ROOT = Path(__file__).resolve().parent.parent
# Full content-category names -> short codes used throughout the report.
CAT_ABBREV = {
    "Board Governance": "BG",
    "Incident Disclosure": "ID",
    "Management Role": "MR",
    "None/Other": "N/O",
    "Risk Management Process": "RMP",
    "Strategy Integration": "SI",
    "Third-Party Risk": "TPR",
}
# Reverse lookup: short code -> full category name.
ABBREV_CAT = {v: k for k, v in CAT_ABBREV.items()}
# The three confusion axes examined in depth: (category_a, category_b, display name).
AXES = [
    ("MR", "RMP", "MR <-> RMP"),
    ("BG", "MR", "BG <-> MR"),
    ("SI", "N/O", "SI <-> N/O"),
]
# Annotation files produced by the bench-holdout models (under data/annotations/bench-holdout/).
BENCH_FILES = [
    "gpt-5.4.jsonl",
    "gemini-3.1-pro-preview.jsonl",
    "glm-5:exacto.jsonl",
    "kimi-k2.5.jsonl",
    "mimo-v2-pro:exacto.jsonl",
    "minimax-m2.7:exacto.jsonl",
]
# Stage-1 model ids -> short source labels used in printed tables.
STAGE1_MODEL_SHORT = {
    "google/gemini-3.1-flash-lite-preview": "s1:gemini-flash",
    "x-ai/grok-4.1-fast": "s1:grok-fast",
    "xiaomi/mimo-v2-flash": "s1:mimo-flash",
}
# Bench-holdout file names -> short source labels used in printed tables.
BENCH_MODEL_SHORT = {
    "gpt-5.4.jsonl": "bench:gpt5.4",
    "gemini-3.1-pro-preview.jsonl": "bench:gemini-pro",
    "glm-5:exacto.jsonl": "bench:glm5",
    "kimi-k2.5.jsonl": "bench:kimi",
    "mimo-v2-pro:exacto.jsonl": "bench:mimo-pro",
    "minimax-m2.7:exacto.jsonl": "bench:minimax",
}
# ── Load data ──────────────────────────────────────────────────────────────────
def load_jsonl(path: str | Path) -> list[dict]:
records = []
with open(path) as f:
for line in f:
line = line.strip()
if line:
records.append(json.loads(line))
return records
def abbrev(cat: str) -> str:
    """Map a full category name to its short code; unknown names pass through unchanged."""
    try:
        return CAT_ABBREV[cat]
    except KeyError:
        return cat
def build_signal_matrix() -> tuple[dict[str, dict[str, str]], dict[str, dict[str, int]]]:
    """Build paragraphId -> {source: category_abbrev} and paragraphId -> {source: specificity}.

    Aggregates every annotation source — human annotators, stage-1 models,
    Opus, and the bench-holdout models — restricted to the gold paragraph IDs
    (the paragraphs that received human labels).
    """
    # The gold set is defined by the human-labelled paragraphs.
    human_labels = load_jsonl(ROOT / "data/gold/human-labels-raw.jsonl")
    gold_pids: set[str] = {rec["paragraphId"] for rec in human_labels}
    cat_matrix: dict[str, dict[str, str]] = defaultdict(dict)
    spec_matrix: dict[str, dict[str, int]] = defaultdict(dict)
    # 1) Human annotators (3 per paragraph)
    for rec in human_labels:
        pid = rec["paragraphId"]
        src = f"human:{rec['annotatorName']}"
        cat_matrix[pid][src] = abbrev(rec["contentCategory"])
        spec_matrix[pid][src] = rec["specificityLevel"]
    # 2) Stage 1 models (filter to gold PIDs). Parsed via load_jsonl for
    # consistency with every other source (also tolerates blank lines, which
    # the previous hand-rolled loop would have crashed on).
    for rec in load_jsonl(ROOT / "data/annotations/stage1.patched.jsonl"):
        pid = rec["paragraphId"]
        if pid not in gold_pids:
            continue
        model_id = rec["provenance"]["modelId"]
        # Fall back to the raw model id if it has no short label.
        src = STAGE1_MODEL_SHORT.get(model_id, model_id)
        cat_matrix[pid][src] = abbrev(rec["label"]["content_category"])
        spec_matrix[pid][src] = rec["label"]["specificity_level"]
    # 3) Opus (golden annotations)
    for rec in load_jsonl(ROOT / "data/annotations/golden/opus.jsonl"):
        pid = rec["paragraphId"]
        if pid in gold_pids:
            cat_matrix[pid]["opus"] = abbrev(rec["label"]["content_category"])
            spec_matrix[pid]["opus"] = rec["label"]["specificity_level"]
    # 4) Bench-holdout models
    for fn in BENCH_FILES:
        src = BENCH_MODEL_SHORT[fn]
        for rec in load_jsonl(ROOT / "data/annotations/bench-holdout" / fn):
            pid = rec["paragraphId"]
            if pid in gold_pids:
                cat_matrix[pid][src] = abbrev(rec["label"]["content_category"])
                spec_matrix[pid][src] = rec["label"]["specificity_level"]
    return dict(cat_matrix), dict(spec_matrix)
def load_paragraphs(gold_pids: set[str]) -> dict[str, dict]:
    """Load holdout paragraph records for the given gold PIDs, keyed by paragraph id."""
    records = load_jsonl(ROOT / "data/gold/paragraphs-holdout.jsonl")
    return {rec["id"]: rec for rec in records if rec["id"] in gold_pids}
# ── Analysis helpers ───────────────────────────────────────────────────────────
def find_axis_paragraphs(
    cat_matrix: dict[str, dict[str, str]], a: str, b: str
) -> list[tuple[str, dict[str, str], int, int]]:
    """Find paragraphs whose primary disagreement is between categories a and b.

    Returns (pid, signals, count_a, count_b) tuples ordered hardest-first,
    i.e. the most evenly split paragraphs come before lopsided ones.
    """
    hits: list[tuple[str, dict[str, str], int, int]] = []
    for pid, signals in cat_matrix.items():
        tally = Counter(signals.values())
        ca = tally.get(a, 0)
        cb = tally.get(b, 0)
        # Both sides must be represented, and together they must account for
        # at least half of all signals on this paragraph (integer form of
        # ca + cb >= 0.5 * n).
        if ca and cb and 2 * (ca + cb) >= len(signals):
            hits.append((pid, signals, ca, cb))
    # A larger minority side means a more even — and therefore harder — split.
    return sorted(hits, key=lambda item: min(item[2], item[3]), reverse=True)
def truncate_text(text: str, max_chars: int = 200) -> str:
    """Shorten *text* to at most *max_chars* characters, appending '...' when cut."""
    if len(text) > max_chars:
        return text[:max_chars].rstrip() + "..."
    return text
def source_order() -> list[str]:
    """Canonical display order: humans, then stage-1 models, then Opus, then bench models."""
    annotators = ("Joey", "Anuj", "Aaryan", "Elisabeth", "Meghan", "Xander")
    order = [f"human:{name}" for name in annotators]
    order.extend(["s1:gemini-flash", "s1:grok-fast", "s1:mimo-flash"])
    order.append("opus")
    order.extend(BENCH_MODEL_SHORT[fn] for fn in BENCH_FILES)
    return order
def format_signal_breakdown(
    signals: dict[str, str], axis_cats: tuple[str, str]
) -> str:
    """Format which sources voted for each side of the axis, plus any stragglers."""
    a, b = axis_cats
    votes_a: list[str] = []
    votes_b: list[str] = []
    votes_other: list[str] = []
    for src in source_order():
        try:
            cat = signals[src]
        except KeyError:
            continue
        if cat == a:
            votes_a.append(src)
        elif cat == b:
            votes_b.append(src)
        else:
            votes_other.append(f"{src}={cat}")
    lines = [
        f" {a} ({len(votes_a)}): {', '.join(votes_a)}",
        f" {b} ({len(votes_b)}): {', '.join(votes_b)}",
    ]
    if votes_other:
        lines.append(f" Other: {', '.join(votes_other)}")
    return "\n".join(lines)
def extract_keyword_frequencies(
    paragraphs: dict[str, dict],
    axis_pids: list[str],
    cat_matrix: dict[str, dict[str, str]],
    cat_a: str,
    cat_b: str,
) -> tuple[Counter, Counter, Counter]:
    """Count domain keywords in paragraphs leaning toward cat_a vs cat_b.

    Returns (freq_in_a_leaning, freq_in_b_leaning, freq_in_all); each Counter
    maps keyword -> number of paragraphs whose lowercased text contains it.
    """
    # Keywords to look for (domain-relevant)
    all_keywords = [
        "board", "director", "committee", "audit", "oversee", "oversight",
        "ciso", "officer", "chief", "vp", "vice president", "manager",
        "manage", "manages", "managing", "management", "responsible",
        "program", "team", "department", "staff", "personnel",
        "report", "reports", "reporting", "brief", "briefing", "informed",
        "incident", "breach", "attack", "compromise", "unauthorized",
        "material", "immaterial", "not material", "no material",
        "strategy", "strategic", "integrate", "integration", "aligned",
        "risk", "assess", "assessment", "framework", "nist", "iso",
        "policy", "policies", "procedure", "procedures",
        "third party", "third-party", "vendor", "supplier", "service provider",
        "insurance", "cyber insurance",
        "training", "awareness", "employee",
        "monitor", "monitoring", "detect", "detection",
        "govern", "governance",
        "experience", "experienced", "background", "qualification", "expertise",
        "day-to-day", "daily", "operational",
        "enterprise", "enterprise-wide",
        "designate", "designated", "appoint", "appointed",
    ]
    # Partition the axis paragraphs by which side their signal majority favours;
    # ties belong to neither group.
    lean_a: list[str] = []
    lean_b: list[str] = []
    for pid in axis_pids:
        votes = Counter(cat_matrix[pid].values())
        na, nb = votes.get(cat_a, 0), votes.get(cat_b, 0)
        if na > nb:
            lean_a.append(pid)
        elif nb > na:
            lean_b.append(pid)

    def tally(pids: list[str]) -> Counter:
        # Presence counts: each keyword is counted at most once per paragraph.
        hits: Counter = Counter()
        for pid in pids:
            para = paragraphs.get(pid)
            if para is None:
                continue
            text_lower = para["text"].lower()
            hits.update(kw for kw in all_keywords if kw in text_lower)
        return hits

    return tally(lean_a), tally(lean_b), tally(axis_pids)
def analyze_human_vs_genai_splits(
    axis_pids: list[str],
    cat_matrix: dict[str, dict[str, str]],
    cat_a: str,
    cat_b: str,
) -> tuple[list[str], list[str]]:
    """Find paragraphs where humans lean one way on the axis but GenAI leans the other.

    Returns (human_a_genai_b, human_b_genai_a): PIDs where the human majority
    on the a/b axis strictly opposes the GenAI majority. Cleanup: the original
    defined an unused ``genai_prefixes`` list — any source not prefixed
    "human:" is a GenAI source (s1:*, opus, bench:*), so a single startswith
    check suffices.
    """
    human_a_genai_b: list[str] = []  # humans say A, GenAI says B
    human_b_genai_a: list[str] = []  # humans say B, GenAI says A
    for pid in axis_pids:
        human_cats: list[str] = []
        genai_cats: list[str] = []
        for src, cat in cat_matrix[pid].items():
            (human_cats if src.startswith("human:") else genai_cats).append(cat)
        human_a = human_cats.count(cat_a)
        human_b = human_cats.count(cat_b)
        genai_a = genai_cats.count(cat_a)
        genai_b = genai_cats.count(cat_b)
        # Only strict-majority flips count; ties on either side are ignored.
        if human_a > human_b and genai_b > genai_a:
            human_a_genai_b.append(pid)
        elif human_b > human_a and genai_a > genai_b:
            human_b_genai_a.append(pid)
    return human_a_genai_b, human_b_genai_a
# ── Main analysis ──────────────────────────────────────────────────────────────
def main():
    """Run the full hard-case analysis and print the report to stdout.

    Sections: overall disagreement statistics, per-axis deep dives (examples,
    human-vs-GenAI splits, keyword patterns), other notable axes, summary
    statistics, specificity spread, recommended codebook rulings,
    maximum-entropy paragraphs, and per-source agreement with the human
    majority.

    Fixes vs the previous revision: the horizontal-rule f-strings multiplied
    the EMPTY string (e.g. ``{'' * 90}``), printing blank separator lines;
    they now use the same '─' character as the file's section dividers.
    """
    print("=" * 100)
    print("HARDEST CASES ANALYSIS: SEC CYBERSECURITY HOLDOUT DATASET")
    print("Examining disagreements across 13 annotation sources to inform codebook rulings")
    print("=" * 100)
    # Load data
    print("\nLoading data...")
    cat_matrix, spec_matrix = build_signal_matrix()
    gold_pids = set(cat_matrix.keys())
    paragraphs = load_paragraphs(gold_pids)
    print(f" Loaded {len(gold_pids)} gold paragraphs with {len(source_order())} potential sources each")
    # Verify source coverage
    source_coverage = Counter()
    for pid in gold_pids:
        for src in cat_matrix[pid]:
            source_coverage[src] += 1
    print("\n Source coverage:")
    for src in source_order():
        print(f" {src}: {source_coverage.get(src, 0)} paragraphs")
    # ── Overall disagreement stats ─────────────────────────────────────────
    print("\n" + "=" * 100)
    print("OVERALL DISAGREEMENT STATISTICS")
    print("=" * 100)
    unanimous = 0
    near_unanimous = 0  # exactly 1 dissenter
    split = 0
    for pid in gold_pids:
        cats = list(cat_matrix[pid].values())
        counts = Counter(cats)
        top = counts.most_common(1)[0][1]
        n = len(cats)
        if top == n:
            unanimous += 1
        elif top >= n - 1:
            near_unanimous += 1
        else:
            split += 1
    print(f"\n Unanimous (all sources agree): {unanimous} ({unanimous/len(gold_pids)*100:.1f}%)")
    print(f" Near-unanimous (1 dissenter): {near_unanimous} ({near_unanimous/len(gold_pids)*100:.1f}%)")
    print(f" Split (2+ dissenters): {split} ({split/len(gold_pids)*100:.1f}%)")
    # Count all pairwise disagreement axes
    axis_counts = Counter()
    for pid in gold_pids:
        cats = list(cat_matrix[pid].values())
        unique = set(cats)
        if len(unique) >= 2:
            for c1 in unique:
                for c2 in unique:
                    if c1 < c2:  # each unordered pair counted exactly once
                        axis_counts[(c1, c2)] += 1
    print("\n All disagreement axes (paragraph has at least 1 source saying each):")
    for (c1, c2), ct in axis_counts.most_common(30):
        print(f" {c1} <-> {c2}: {ct} paragraphs")
    # ── Axis-specific analysis ─────────────────────────────────────────────
    # NOTE(review): all_axis_results is collected but never read again below —
    # candidate for removal if no interactive use depends on it.
    all_axis_results = {}
    for cat_a, cat_b, axis_name in AXES:
        print("\n" + "=" * 100)
        print(f"AXIS: {axis_name}")
        print("=" * 100)
        axis_pids_data = find_axis_paragraphs(cat_matrix, cat_a, cat_b)
        axis_pids = [x[0] for x in axis_pids_data]
        all_axis_results[axis_name] = axis_pids
        print(f"\n Paragraphs with primary {cat_a}/{cat_b} disagreement: {len(axis_pids)}")
        if not axis_pids:
            print(" No paragraphs found on this axis.")
            continue
        # ── Signal split statistics ────────────────────────────────────────
        # Count how the split goes (majority A vs majority B)
        majority_a = sum(1 for _, _, ca, cb in axis_pids_data if ca > cb)
        majority_b = sum(1 for _, _, ca, cb in axis_pids_data if cb > ca)
        tied = sum(1 for _, _, ca, cb in axis_pids_data if ca == cb)
        print(f" Majority {cat_a}: {majority_a} | Majority {cat_b}: {majority_b} | Tied: {tied}")
        # ── Human vs GenAI splits ──────────────────────────────────────────
        human_a_genai_b, human_b_genai_a = analyze_human_vs_genai_splits(
            axis_pids, cat_matrix, cat_a, cat_b
        )
        print("\n Human/GenAI disagreements:")
        print(f" Humans say {cat_a}, GenAI says {cat_b}: {len(human_a_genai_b)}")
        print(f" Humans say {cat_b}, GenAI says {cat_a}: {len(human_b_genai_a)}")
        # ── Representative examples ────────────────────────────────────────
        # Show hardest cases (most evenly split)
        n_examples = min(10, len(axis_pids_data))
        print(f"\n {'─' * 90}")  # fixed: was '' * 90 (empty separator)
        print(f" TOP {n_examples} MOST CONTENTIOUS PARAGRAPHS")
        print(f" {'─' * 90}")
        for i, (pid, signals, ca, cb) in enumerate(axis_pids_data[:n_examples]):
            para = paragraphs.get(pid, {})
            text = para.get("text", "[text not found]")
            company = para.get("companyName", "?")
            word_count = para.get("wordCount", "?")
            print(f"\n [{i+1}] PID: {pid[:12]}... Company: {company}")
            print(f" Words: {word_count} | Split: {ca} say {cat_a}, {cb} say {cat_b}, {len(signals)-ca-cb} say other")
            print(f" Text: {truncate_text(text, 250)}")
            print(format_signal_breakdown(signals, (cat_a, cat_b)))
        # ── Human-A / GenAI-B examples ─────────────────────────────────────
        if human_a_genai_b:
            print(f"\n {'─' * 90}")
            print(f" HUMANS SAY {cat_a}, GenAI SAYS {cat_b} (up to 5 examples)")
            print(f" {'─' * 90}")
            for pid in human_a_genai_b[:5]:
                para = paragraphs.get(pid, {})
                text = para.get("text", "[text not found]")
                print(f"\n PID: {pid[:12]}...")
                print(f" Text: {truncate_text(text, 250)}")
                print(format_signal_breakdown(cat_matrix[pid], (cat_a, cat_b)))
        if human_b_genai_a:
            print(f"\n {'─' * 90}")
            print(f" HUMANS SAY {cat_b}, GenAI SAYS {cat_a} (up to 5 examples)")
            print(f" {'─' * 90}")
            for pid in human_b_genai_a[:5]:
                para = paragraphs.get(pid, {})
                text = para.get("text", "[text not found]")
                print(f"\n PID: {pid[:12]}...")
                print(f" Text: {truncate_text(text, 250)}")
                print(format_signal_breakdown(cat_matrix[pid], (cat_a, cat_b)))
        # ── Keyword / linguistic patterns ──────────────────────────────────
        print(f"\n {'─' * 90}")
        print(" LINGUISTIC PATTERNS")
        print(f" {'─' * 90}")
        freq_a, freq_b, freq_all = extract_keyword_frequencies(
            paragraphs, axis_pids, cat_matrix, cat_a, cat_b
        )
        # Compute over-representation: keywords more common when majority says
        # A vs B. One pass per paragraph (the previous version rebuilt the
        # Counter four times per pid inside two generator expressions).
        lean_a_ct = 0
        lean_b_ct = 0
        for pid in axis_pids:
            votes = Counter(cat_matrix[pid].values())
            na, nb = votes.get(cat_a, 0), votes.get(cat_b, 0)
            if na > nb:
                lean_a_ct += 1
            elif nb > na:
                lean_b_ct += 1
        print(f"\n Paragraphs leaning {cat_a}: {lean_a_ct} | leaning {cat_b}: {lean_b_ct}")
        # Show keywords sorted by differential
        all_kws = set(freq_a.keys()) | set(freq_b.keys())
        diffs = []
        for kw in all_kws:
            fa = freq_a.get(kw, 0)
            fb = freq_b.get(kw, 0)
            total = freq_all.get(kw, 0)
            if total < 3:  # too rare to be a useful discriminator
                continue
            # Normalize by group size (max(...,1) guards an empty group)
            rate_a = fa / max(lean_a_ct, 1)
            rate_b = fb / max(lean_b_ct, 1)
            diff = rate_a - rate_b
            diffs.append((kw, fa, fb, total, rate_a, rate_b, diff))
        diffs.sort(key=lambda x: -abs(x[6]))
        print(f"\n Keywords by differential (rate in {cat_a}-leaning vs {cat_b}-leaning paragraphs):")
        print(f" {'Keyword':<22} {'In '+cat_a:>8} {'In '+cat_b:>8} {'Total':>8} {'Rate '+cat_a:>10} {'Rate '+cat_b:>10} {'Diff':>8}")
        print(f" {'─'*22} {'─'*8} {'─'*8} {'─'*8} {'─'*10} {'─'*10} {'─'*8}")
        for kw, fa, fb, total, ra, rb, diff in diffs[:25]:
            marker = f"<- {cat_a}" if diff > 0.05 else (f"<- {cat_b}" if diff < -0.05 else "")
            print(f" {kw:<22} {fa:>8} {fb:>8} {total:>8} {ra:>10.2%} {rb:>10.2%} {diff:>+8.2%} {marker}")
    # ── Other notable axes ─────────────────────────────────────────────────
    print("\n" + "=" * 100)
    print("OTHER NOTABLE DISAGREEMENT AXES (10+ paragraphs)")
    print("=" * 100)
    # Both orderings of each primary pair, since axis_counts keys are sorted pairs.
    primary_axis_set = {("BG", "MR"), ("MR", "BG"), ("MR", "RMP"), ("RMP", "MR"), ("N/O", "SI"), ("SI", "N/O")}
    other_axes = []
    for (c1, c2), ct in axis_counts.most_common():
        if (c1, c2) not in primary_axis_set and ct >= 10:
            other_axes.append((c1, c2, ct))
    if not other_axes:
        print("\n No other axes with 10+ paragraphs.")
    else:
        for cat_a, cat_b, count in other_axes:
            print(f"\n {'─' * 90}")
            print(f" {cat_a} <-> {cat_b}: {count} paragraphs")
            print(f" {'─' * 90}")
            axis_pids_data = find_axis_paragraphs(cat_matrix, cat_a, cat_b)
            # Show up to 5 examples
            for i, (pid, signals, ca, cb) in enumerate(axis_pids_data[:5]):
                para = paragraphs.get(pid, {})
                text = para.get("text", "[text not found]")
                print(f"\n [{i+1}] {truncate_text(text, 200)}")
                print(f" Split: {ca}x {cat_a}, {cb}x {cat_b}")
                print(format_signal_breakdown(signals, (cat_a, cat_b)))
    # ── Summary statistics ─────────────────────────────────────────────────
    print("\n" + "=" * 100)
    print("SUMMARY STATISTICS")
    print("=" * 100)
    # Per-axis counts
    print("\n Paragraphs on each primary confusion axis:")
    for cat_a, cat_b, axis_name in AXES:
        axis_data = find_axis_paragraphs(cat_matrix, cat_a, cat_b)
        print(f" {axis_name}: {len(axis_data)} paragraphs")
    # How many could potentially be resolved by keyword rules?
    print("\n Keyword-resolvable estimate (paragraphs containing strong discriminator keywords):")
    mr_rmp_data = find_axis_paragraphs(cat_matrix, "MR", "RMP")
    mr_rmp_pids = [x[0] for x in mr_rmp_data]
    resolvable_mr_rmp = 0
    mr_keywords = {"ciso", "chief information security", "chief security", "vp", "vice president",
                   "officer", "director of", "head of", "reports to", "reporting to"}
    rmp_keywords = {"framework", "nist", "iso", "soc 2", "assessment", "penetration test",
                    "vulnerability scan", "audit", "tabletop"}
    for pid in mr_rmp_pids:
        text_lower = paragraphs.get(pid, {}).get("text", "").lower()
        has_mr = any(kw in text_lower for kw in mr_keywords)
        has_rmp = any(kw in text_lower for kw in rmp_keywords)
        if has_mr != has_rmp:  # one side but not the other
            resolvable_mr_rmp += 1
    print(f" MR <-> RMP: {resolvable_mr_rmp}/{len(mr_rmp_pids)} have clear keyword signal ({resolvable_mr_rmp/max(len(mr_rmp_pids),1)*100:.0f}%)")
    bg_mr_data = find_axis_paragraphs(cat_matrix, "BG", "MR")
    bg_mr_pids = [x[0] for x in bg_mr_data]
    resolvable_bg_mr = 0
    bg_keywords = {"board", "director", "committee", "audit committee", "board of directors"}
    mr_only_keywords = {"ciso", "chief information security", "officer", "vp", "management",
                        "team", "department", "staff", "day-to-day", "operational"}
    for pid in bg_mr_pids:
        text_lower = paragraphs.get(pid, {}).get("text", "").lower()
        has_bg = any(kw in text_lower for kw in bg_keywords)
        has_mr_only = any(kw in text_lower for kw in mr_only_keywords)
        # Same "exactly one side fires" rule as MR <-> RMP above (was a
        # duplicated two-branch increment).
        if has_bg != has_mr_only:
            resolvable_bg_mr += 1
    print(f" BG <-> MR: {resolvable_bg_mr}/{len(bg_mr_pids)} have clear keyword signal ({resolvable_bg_mr/max(len(bg_mr_pids),1)*100:.0f}%)")
    si_no_data = find_axis_paragraphs(cat_matrix, "SI", "N/O")
    si_no_pids = [x[0] for x in si_no_data]
    resolvable_si_no = 0
    si_keywords = {"incident", "breach", "attack", "compromise", "unauthorized access",
                   "ransomware", "malware", "phishing", "data loss", "disruption"}
    no_keywords = {"no material", "not material", "have not experienced", "no known",
                   "not aware of any", "not been subject"}
    for pid in si_no_pids:
        text_lower = paragraphs.get(pid, {}).get("text", "").lower()
        has_si = any(kw in text_lower for kw in si_keywords)
        has_no = any(kw in text_lower for kw in no_keywords)
        # Negation phrasing resolves to N/O; otherwise incident language
        # resolves to SI. (Simplified from a redundant if/elif — the elif's
        # "not has_no" was always true.)
        if has_no or has_si:
            resolvable_si_no += 1
    print(f" SI <-> N/O: {resolvable_si_no}/{len(si_no_pids)} have clear keyword signal ({resolvable_si_no/max(len(si_no_pids),1)*100:.0f}%)")
    # ── Specificity disagreements on confused paragraphs ───────────────────
    print("\n" + "=" * 100)
    print("SPECIFICITY DISAGREEMENT ON CONFUSED PARAGRAPHS")
    print("=" * 100)
    for cat_a, cat_b, axis_name in AXES:
        axis_data = find_axis_paragraphs(cat_matrix, cat_a, cat_b)
        if not axis_data:
            continue
        spec_ranges = []
        for pid, signals, _, _ in axis_data:
            specs = list(spec_matrix.get(pid, {}).values())
            if specs:
                spec_ranges.append(max(specs) - min(specs))
        if spec_ranges:
            avg_range = np.mean(spec_ranges)
            print(f"\n {axis_name}: avg specificity range = {avg_range:.2f} (0=agree, 3=max disagree)")
            range_dist = Counter(spec_ranges)
            for r in sorted(range_dist.keys()):
                print(f" Range {r}: {range_dist[r]} paragraphs")
    # ── Recommended codebook rulings ───────────────────────────────────────
    print("\n" + "=" * 100)
    print("RECOMMENDED CODEBOOK RULINGS")
    print("=" * 100)
    print("""
Based on the analysis above, the following rulings would resolve the most cases:
RULING 1: MR vs RMP — "Named-role test"
──────────────────────────────────────────
If the paragraph's PRIMARY subject is a named individual, titled role (CISO, VP,
CTO, etc.), or a specific person's responsibilities/qualifications/experience,
classify as MR. If the paragraph's PRIMARY subject is a process, program, system,
or methodology (even if it mentions who runs it), classify as RMP.
Disambiguator: Ask "Is this paragraph ABOUT a person/role, or ABOUT a process?"
- "Our CISO oversees our cybersecurity program" → MR (about the CISO)
- "Our cybersecurity program includes monitoring, led by the CISO" → RMP (about the program)
RULING 2: BG vs MR — "Board-line test"
──────────────────────────────────────────
If the paragraph describes oversight, reporting, or governance AT or ABOVE the
board/committee level, classify as BG. If it describes responsibilities BELOW
the board level (C-suite officers reporting TO the board, management teams,
operational roles), classify as MR.
Disambiguator: "Does this paragraph describe what the board/committee DOES,
or what someone REPORTS TO the board?"
- "The Audit Committee oversees cybersecurity risk" → BG
- "The CISO reports quarterly to the Audit Committee" → BG (board's receiving mechanism)
- "The CISO manages a team of security analysts" → MR
Key edge case: When a paragraph describes BOTH board oversight AND management
roles, classify by the paragraph's PRIMARY focus. If roughly equal, prefer BG
when board action is the grammatical subject.
RULING 3: SI vs N/O — "Negative-incident test"
──────────────────────────────────────────
Negative incident statements ("we have not experienced any material cybersecurity
incidents") should be classified as N/O, NOT as SI. SI requires disclosure of an
ACTUAL incident that occurred. The mere mention of incidents in a negation context
does not constitute incident disclosure.
However: If the paragraph describes a SPECIFIC past incident (even if resolved or
deemed immaterial), classify as SI. The test is: "Did something actually happen?"
- "We have not experienced material incidents" → N/O
- "In 2023, we experienced a ransomware attack that..." → SI
- "We experienced incidents but none were material" → SI (something happened)
""")
    # ── Deep dive: the very hardest cases ──────────────────────────────────
    print("=" * 100)
    print("DEEP DIVE: PARAGRAPHS WITH MAXIMUM ENTROPY (4+ DISTINCT CATEGORIES)")
    print("=" * 100)
    high_entropy = []
    for pid in gold_pids:
        cats = list(cat_matrix[pid].values())
        n_unique = len(set(cats))
        if n_unique >= 4:
            high_entropy.append((pid, n_unique, Counter(cats)))
    high_entropy.sort(key=lambda x: -x[1])
    print(f"\n {len(high_entropy)} paragraphs with 4+ distinct category labels")
    for i, (pid, n_unique, counts) in enumerate(high_entropy[:10]):
        para = paragraphs.get(pid, {})
        text = para.get("text", "[text not found]")
        print(f"\n [{i+1}] PID: {pid[:12]}... ({n_unique} categories)")
        print(f" Text: {truncate_text(text, 250)}")
        print(f" Distribution: {dict(counts.most_common())}")
        # Show all sources
        for src in source_order():
            if src in cat_matrix[pid]:
                cat = cat_matrix[pid][src]
                spec = spec_matrix.get(pid, {}).get(src, "?")
                print(f" {src:<25} {cat:<5} spec={spec}")
    # ── Per-source accuracy vs human majority ──────────────────────────────
    print("\n" + "=" * 100)
    print("GENAI SOURCE AGREEMENT WITH HUMAN MAJORITY (on axis-confused paragraphs only)")
    print("=" * 100)
    for cat_a, cat_b, axis_name in AXES:
        axis_data = find_axis_paragraphs(cat_matrix, cat_a, cat_b)
        if not axis_data:
            continue
        print(f"\n {axis_name} ({len(axis_data)} paragraphs):")
        # For each paragraph, determine human majority
        genai_sources = [s for s in source_order() if not s.startswith("human:")]
        source_agree = {s: 0 for s in genai_sources}
        source_total = {s: 0 for s in genai_sources}
        for pid, signals, _, _ in axis_data:
            # Human majority on this axis (only human votes for cat_a/cat_b count)
            human_cats = [
                signals[s] for s in signals
                if s.startswith("human:") and signals[s] in (cat_a, cat_b)
            ]
            if not human_cats:
                continue
            human_majority = Counter(human_cats).most_common(1)[0][0]
            for src in genai_sources:
                if src in signals:
                    source_total[src] += 1
                    if signals[src] == human_majority:
                        source_agree[src] += 1
        print(f" {'Source':<25} {'Agree':>8} {'Total':>8} {'Rate':>8}")
        print(f" {'─'*25} {'─'*8} {'─'*8} {'─'*8}")
        for src in genai_sources:
            total = source_total[src]
            agree = source_agree[src]
            rate = agree / max(total, 1)
            print(f" {src:<25} {agree:>8} {total:>8} {rate:>8.1%}")
    print("\n" + "=" * 100)
    print("END OF ANALYSIS")
    print("=" * 100)
# Script entry point: run the full report when executed directly
# (e.g. `uv run --with numpy scripts/examine-hard-cases.py`).
if __name__ == "__main__":
    main()