"""
Audit Stage 1 annotations for systematic SI↔N/O miscoding.

Stage 1 used prompt v2.5 which lacked the rule "materiality disclaimers → SI."
This script quantifies how many N/O labels likely should have been SI, plus
other potential miscoding axes.

Run: uv run --with numpy scripts/audit-stage1-labels.py
"""
|
|
|
|
import json
|
|
import re
|
|
import sys
|
|
from collections import Counter, defaultdict
|
|
from pathlib import Path
|
|
|
|
# ── Paths ──────────────────────────────────────────────────────────────
# All inputs are resolved relative to the repository root (one directory
# above the scripts/ folder containing this file).
ROOT = Path(__file__).resolve().parent.parent
# Stage 1 model annotations (patched JSONL).
ANNOTATIONS = ROOT / "data" / "annotations" / "stage1.patched.jsonl"
# Paragraph texts: prefer the patched file; fall back to the unpatched one
# (see load_paragraphs()).
PARAGRAPHS = ROOT / "data" / "paragraphs" / "paragraphs-clean.patched.jsonl"
PARAGRAPHS_FALLBACK = ROOT / "data" / "paragraphs" / "paragraphs-clean.jsonl"
# Gold holdout set and raw human labels used for cross-validation (section 6).
HOLDOUT = ROOT / "data" / "gold" / "paragraphs-holdout.jsonl"
HUMAN_LABELS = ROOT / "data" / "gold" / "human-labels-raw.jsonl"
# ── Materiality regex patterns ─────────────────────────────────────────
# Pattern 1: "material" near business/strategy language (within ~15 words).
# Two branches: "material(ly)" followed within 100 chars by a business term,
# or a business term followed within 100 chars by "material(ly)".
PAT_MATERIAL_NEAR_BIZ = re.compile(
    r"material(?:ly)?\b.{0,100}\b(?:business\s+strategy|results\s+of\s+operations|financial\s+condition|business|operations)"
    r"|"
    # Fixed: this branch previously read r"\baterial(?:ly)?" (missing the
    # leading "m"), so the "<business term> ... material" order never matched.
    r"(?:business\s+strategy|results\s+of\s+operations|financial\s+condition)\b.{0,100}\bmaterial(?:ly)?",
    re.IGNORECASE,
)
|
|
|
|
# Pattern 2: specific materiality disclaimer phrases.
# An alternation of stock 10-K phrasings ("have not materially affected ...",
# "could have a material adverse effect ...", etc.). Later branches partly
# subsume earlier ones; since callers only use .search(), any single branch
# hitting anywhere in the text is enough.
PAT_MATERIALITY_DISCLAIMER = re.compile(
    r"have\s+not\s+materially\s+affected"
    r"|has\s+not\s+materially\s+affected"
    r"|could\s+materially\s+affect"
    r"|could\s+have\s+a\s+material\s+(?:adverse\s+)?(?:effect|impact)"
    r"|may\s+(?:materially|have\s+a\s+material)\s+(?:adverse\s+)?(?:effect|impact|affect)"
    r"|reasonably\s+likely\s+to\s+materially\s+affect"
    r"|not\s+reasonably\s+likely"
    r"|materially\s+(?:adverse(?:ly)?|impact|affect)"
    r"|material\s+adverse\s+(?:effect|impact)"
    r"|no\s+material\s+(?:adverse\s+)?(?:effect|impact)"
    r"|did\s+not\s+(?:have\s+a\s+)?material(?:ly)?\s+(?:adverse\s+)?(?:effect|impact|affect)",
    re.IGNORECASE,
)
|
|
|
|
# Pattern 3: explicit SI-relevant phrases.
# Direct Strategy Integration vocabulary: references to business strategy /
# results of operations / financial condition, or language about security
# being integrated into broader business or enterprise risk processes.
PAT_SI_PHRASES = re.compile(
    r"business\s+strategy"
    r"|results\s+of\s+operations"
    r"|financial\s+condition"
    r"|integrated\s+(?:into|with)\s+(?:our\s+)?(?:overall|business)"
    r"|part\s+of\s+(?:our\s+)?(?:overall|broader)\s+(?:risk|enterprise|business)",
    re.IGNORECASE,
)
|
|
|
|
|
|
def has_materiality_language(text: str) -> bool:
    """Return True if *text* contains materiality-related language indicative of SI.

    Checks the three materiality regexes in turn; any single hit is enough.
    """
    checks = (PAT_MATERIALITY_DISCLAIMER, PAT_SI_PHRASES, PAT_MATERIAL_NEAR_BIZ)
    return any(pattern.search(text) for pattern in checks)
|
|
|
|
|
|
# ── Insurance / budget / incident patterns ─────────────────────────────
# Any mention of "insurance" as a standalone word.
PAT_INSURANCE = re.compile(r"\binsurance\b", re.IGNORECASE)
# Budget / investment(s) language.
PAT_BUDGET = re.compile(r"\b(?:budget|investment(?:s)?)\b", re.IGNORECASE)
# First-person incident disclosures: "we experienced/suffered/detected/..."
PAT_INCIDENT = re.compile(
    r"\bwe\s+(?:experienced|suffered|detected|identified|discovered|encountered|were\s+subject\s+to)\b",
    re.IGNORECASE,
)
|
|
|
|
# ── Cross-category confusion patterns ──────────────────────────────────
# Process-oriented vocabulary; repeated hits in a Management Role paragraph
# suggest it may really be Risk Management Process (see section 7a in main()).
PAT_PROGRAM_FRAMEWORK = re.compile(
    r"\b(?:program|framework|process(?:es)?|procedure(?:s)?)\b", re.IGNORECASE
)
|
|
# Executive titles and C-suite acronyms; used to flag RMP paragraphs that name
# specific people/roles (potential MR) — see sections 7b/7c in main().
PAT_TITLE = re.compile(
    r"\b(?:Chief\s+(?:Information|Technology|Executive|Financial|Security|Operating|Risk)\s+(?:Officer|Security\s+Officer))"
    # Fixed: the acronym branch lacked a leading \b, so under re.IGNORECASE it
    # matched mid-word substrings (e.g. the "cio" in "socio-economic").
    r"|\b(?:CISO|CIO|CTO|CFO|CEO|COO|CRO)\b"
    r"|\b(?:Vice\s+President|Director|Senior\s+Vice\s+President|EVP|SVP)\b",
    re.IGNORECASE,
)
|
|
# Generic management vocabulary (management / officers / executives / leaders
# / leadership); combined with PAT_TITLE in section 7c to find Board
# Governance paragraphs that are really about management.
PAT_MANAGEMENT_OFFICERS = re.compile(
    r"\b(?:management|officer(?:s)?|executive(?:s)?|leader(?:s)?(?:hip)?)\b",
    re.IGNORECASE,
)
|
|
|
|
|
|
def separator(title: str) -> None:
    """Print an 80-column '=' banner around *title* to mark a report section."""
    bar = "=" * 80
    print(f"\n{bar}\n {title}\n{bar}")
|
|
|
|
|
|
def print_example(idx: int, pid: str, text: str, extra: str = "") -> None:
    """Print one numbered example paragraph.

    Shows the paragraph id, an optional extra annotation line, and the text
    truncated to 500 characters (with a "..." marker) for readability.
    """
    snippet = text if len(text) <= 500 else text[:500] + "..."
    print(f"\n [{idx}] paragraphId: {pid}")
    if extra:
        print(f" {extra}")
    print(f" TEXT: {snippet}")
|
|
|
|
|
|
# ── Load data ──────────────────────────────────────────────────────────
|
|
def load_annotations() -> dict[str, list[dict]]:
    """Load Stage 1 annotations, grouped by paragraph.

    Returns {paragraphId: [{"category": ..., "model": ...}, ...]} built from
    the ANNOTATIONS JSONL file (one annotation record per line).
    """
    grouped: dict[str, list[dict]] = defaultdict(list)
    with open(ANNOTATIONS) as fh:
        for raw in fh:
            record = json.loads(raw)
            grouped[record["paragraphId"]].append(
                {
                    "category": record["label"]["content_category"],
                    "model": record["provenance"]["modelId"],
                }
            )
    return dict(grouped)
|
|
|
|
|
|
def load_paragraphs() -> dict[str, str]:
    """Load paragraph texts as {paragraphId: text}.

    Prefers the patched paragraphs file; falls back to the unpatched one if
    the patched file does not exist.
    """
    source = PARAGRAPHS if PARAGRAPHS.exists() else PARAGRAPHS_FALLBACK
    with open(source) as fh:
        return {rec["id"]: rec["text"] for rec in map(json.loads, fh)}
|
|
|
|
|
|
def load_holdout() -> dict[str, dict]:
    """Load the gold holdout set as {paragraphId: record}.

    Each record is the full JSONL object (text, stage1Category, stage1Method,
    and any other fields present in the file).
    """
    with open(HOLDOUT) as fh:
        return {rec["id"]: rec for rec in map(json.loads, fh)}
|
|
|
|
|
|
def load_human_labels() -> dict[str, list[dict]]:
    """Load raw human labels, grouped by paragraph.

    Returns {paragraphId: [{"annotator", "category", "specificity"}, ...]}
    built from the HUMAN_LABELS JSONL file (one label record per line).
    """
    collected: dict[str, list[dict]] = defaultdict(list)
    with open(HUMAN_LABELS) as fh:
        for raw in fh:
            rec = json.loads(raw)
            collected[rec["paragraphId"]].append(
                {
                    "annotator": rec["annotatorName"],
                    "category": rec["contentCategory"],
                    "specificity": rec["specificityLevel"],
                }
            )
    return dict(collected)
|
|
|
|
|
|
def main() -> None:
    """Run the full Stage 1 label audit and print a sectioned report to stdout.

    Sections:
      1/2. Unanimous and majority N/O paragraphs containing materiality
           language — the suspected N/O → SI miscoding from prompt v2.5.
      3.   Clean unanimous SI examples, for contrast.
      4.   Other N/O miscoding signals (insurance / budget / incident).
      5.   Aggregate impact estimates on the training set.
      6.   Cross-check against the human-labeled holdout set.
      7.   Other cross-category confusion axes (MR↔RMP, RMP↔MR, BG↔mgmt).
    """
    print("Loading data...")
    annotations = load_annotations()
    texts = load_paragraphs()
    holdout = load_holdout()
    human_labels = load_human_labels()

    print(f" Annotations: {sum(len(v) for v in annotations.values())} across {len(annotations)} paragraphs")
    print(f" Paragraph texts loaded: {len(texts)}")
    print(f" Holdout paragraphs: {len(holdout)}")
    print(f" Human-labeled paragraphs: {len(human_labels)}")

    # ── Classify each paragraph by voting ──────────────────────────────
    # Bucket each fully-annotated paragraph (exactly 3 model votes) by the
    # vote outcome on content_category.
    unanimous_no: list[str] = []
    majority_no: list[str] = []  # 2/3 N/O
    unanimous_si: list[str] = []
    unanimous_mr: list[str] = []
    unanimous_rmp: list[str] = []
    unanimous_bg: list[str] = []
    all_unanimous: dict[str, str] = {}  # pid -> category for unanimous

    for pid, anns in annotations.items():
        cats = [a["category"] for a in anns]
        cat_counts = Counter(cats)

        if len(cats) != 3:
            continue  # skip incomplete

        if cat_counts.get("None/Other", 0) == 3:
            unanimous_no.append(pid)
            all_unanimous[pid] = "None/Other"
        elif cat_counts.get("None/Other", 0) == 2:
            majority_no.append(pid)
        elif cat_counts.get("Strategy Integration", 0) == 3:
            unanimous_si.append(pid)
            all_unanimous[pid] = "Strategy Integration"
        elif cat_counts.get("Management Role", 0) == 3:
            unanimous_mr.append(pid)
            all_unanimous[pid] = "Management Role"
        elif cat_counts.get("Risk Management Process", 0) == 3:
            unanimous_rmp.append(pid)
            all_unanimous[pid] = "Risk Management Process"
        elif cat_counts.get("Board Governance", 0) == 3:
            unanimous_bg.append(pid)
            all_unanimous[pid] = "Board Governance"

        # Track all unanimous (also catches unanimous categories not
        # explicitly listed above).
        if len(cat_counts) == 1:
            all_unanimous[pid] = cats[0]

    print(f"\n Unanimous N/O: {len(unanimous_no)}")
    print(f" Majority N/O (2/3): {len(majority_no)}")
    print(f" Unanimous SI: {len(unanimous_si)}")
    print(f" Unanimous MR: {len(unanimous_mr)}")
    print(f" Unanimous RMP: {len(unanimous_rmp)}")
    print(f" Unanimous BG: {len(unanimous_bg)}")
    print(f" Total unanimous (any): {len(all_unanimous)}")

    # ════════════════════════════════════════════════════════════════════
    # 1. Unanimous N/O with materiality language
    # ════════════════════════════════════════════════════════════════════
    separator("1. UNANIMOUS N/O WITH MATERIALITY LANGUAGE")

    no_with_mat: list[tuple[str, str]] = []
    no_without_text = 0  # paragraphs with annotations but no loaded text
    for pid in unanimous_no:
        text = texts.get(pid)
        if text is None:
            no_without_text += 1
            continue
        if has_materiality_language(text):
            no_with_mat.append((pid, text))

    print(f"\n Total unanimous N/O: {len(unanimous_no)}")
    print(f" Missing text: {no_without_text}")
    print(f" With materiality language: {len(no_with_mat)}")
    print(f" Percentage of unanimous N/O: {len(no_with_mat) / max(1, len(unanimous_no)) * 100:.1f}%")

    print(f"\n --- 10 representative examples ---")
    # Pick a diverse sample: take every Nth
    step = max(1, len(no_with_mat) // 10)
    shown = 0
    for i in range(0, len(no_with_mat), step):
        if shown >= 10:
            break
        pid, text = no_with_mat[i]
        print_example(shown + 1, pid, text)
        shown += 1

    # ════════════════════════════════════════════════════════════════════
    # 2. Majority N/O with materiality language
    # ════════════════════════════════════════════════════════════════════
    separator("2. MAJORITY N/O (2/3) WITH MATERIALITY LANGUAGE")

    maj_no_with_mat: list[tuple[str, str, str, str]] = []  # pid, text, dissenting_model, dissenting_cat
    for pid in majority_no:
        text = texts.get(pid)
        if text is None:
            continue
        if has_materiality_language(text):
            anns = annotations[pid]
            # Record the single dissenting (non-N/O) vote.
            for a in anns:
                if a["category"] != "None/Other":
                    maj_no_with_mat.append((pid, text, a["model"], a["category"]))
                    break

    print(f"\n Total majority N/O (2/3): {len(majority_no)}")
    print(f" With materiality language: {len(maj_no_with_mat)}")
    print(f" Percentage: {len(maj_no_with_mat) / max(1, len(majority_no)) * 100:.1f}%")

    # Count dissenting categories
    dissent_cats = Counter(x[3] for x in maj_no_with_mat)
    print(f"\n Dissenting model voted:")
    for cat, cnt in dissent_cats.most_common():
        print(f" {cat}: {cnt}")

    # Count dissenting models
    dissent_models = Counter(x[2] for x in maj_no_with_mat)
    print(f"\n Which models dissented:")
    for model, cnt in dissent_models.most_common():
        print(f" {model}: {cnt}")

    print(f"\n --- 5 examples ---")
    step = max(1, len(maj_no_with_mat) // 5)
    shown = 0
    for i in range(0, len(maj_no_with_mat), step):
        if shown >= 5:
            break
        pid, text, model, cat = maj_no_with_mat[i]
        print_example(shown + 1, pid, text, f"Dissent: {model} → {cat}")
        shown += 1

    # ════════════════════════════════════════════════════════════════════
    # 3. Unanimous SI examples (contrast)
    # ════════════════════════════════════════════════════════════════════
    separator("3. UNANIMOUS SI — WHAT CLEAN SI LOOKS LIKE")

    si_examples: list[tuple[str, str]] = []
    for pid in unanimous_si:
        text = texts.get(pid)
        if text:
            si_examples.append((pid, text))
        if len(si_examples) >= 20:
            break

    print(f"\n Total unanimous SI: {len(unanimous_si)}")
    print(f"\n --- 5 examples ---")
    for i, (pid, text) in enumerate(si_examples[:5]):
        print_example(i + 1, pid, text)

    # Analyze SI language patterns
    si_has_materiality = sum(1 for pid in unanimous_si if pid in texts and has_materiality_language(texts[pid]))
    si_has_insurance = sum(1 for pid in unanimous_si if pid in texts and PAT_INSURANCE.search(texts[pid]))
    si_has_budget = sum(1 for pid in unanimous_si if pid in texts and PAT_BUDGET.search(texts[pid]))
    print(f"\n SI language patterns:")
    print(f" With materiality language: {si_has_materiality} / {len(unanimous_si)} ({si_has_materiality / max(1, len(unanimous_si)) * 100:.1f}%)")
    print(f" Mention insurance: {si_has_insurance} / {len(unanimous_si)}")
    print(f" Mention budget/investment: {si_has_budget} / {len(unanimous_si)}")

    # ════════════════════════════════════════════════════════════════════
    # 4. N/O with other potential miscoding
    # ════════════════════════════════════════════════════════════════════
    separator("4. N/O PARAGRAPHS WITH OTHER POTENTIAL MISCODING")

    no_insurance: list[tuple[str, str]] = []
    no_budget: list[tuple[str, str]] = []
    no_incident: list[tuple[str, str]] = []

    for pid in unanimous_no:
        text = texts.get(pid)
        if text is None:
            continue
        if PAT_INSURANCE.search(text):
            no_insurance.append((pid, text))
        if PAT_BUDGET.search(text):
            no_budget.append((pid, text))
        if PAT_INCIDENT.search(text):
            no_incident.append((pid, text))

    print(f"\n Unanimous N/O mentioning insurance: {len(no_insurance)}")
    print(f" Unanimous N/O mentioning budget/investment: {len(no_budget)}")
    print(f" Unanimous N/O mentioning incidents ('we experienced...'): {len(no_incident)}")

    # Show examples for each
    print(f"\n --- Insurance examples (up to 3) ---")
    for i, (pid, text) in enumerate(no_insurance[:3]):
        print_example(i + 1, pid, text)

    print(f"\n --- Budget/investment examples (up to 3) ---")
    for i, (pid, text) in enumerate(no_budget[:3]):
        print_example(i + 1, pid, text)

    print(f"\n --- Incident examples (up to 3) ---")
    for i, (pid, text) in enumerate(no_incident[:3]):
        print_example(i + 1, pid, text)

    # ════════════════════════════════════════════════════════════════════
    # 5. Scale the problem
    # ════════════════════════════════════════════════════════════════════
    separator("5. SCALE THE PROBLEM")

    # Deduplicate: some paragraphs may hit multiple patterns
    no_any_miscoded = set()
    for pid, _ in no_with_mat:
        no_any_miscoded.add(pid)
    for pid, _ in no_insurance:
        no_any_miscoded.add(pid)
    for pid, _ in no_budget:
        no_any_miscoded.add(pid)
    no_incident_pids = set(pid for pid, _ in no_incident)

    # Materiality-only (not already insurance/budget)
    # NOTE(review): ins_only / bud_only are computed but never printed below —
    # either report them or drop these two lines.
    mat_only = set(pid for pid, _ in no_with_mat)
    ins_only = set(pid for pid, _ in no_insurance) - mat_only
    bud_only = set(pid for pid, _ in no_budget) - mat_only - ins_only

    total_unanimous = len(all_unanimous)
    total_annotations = len(annotations)

    print(f"\n Total paragraphs with 3 annotations: {total_annotations}")
    print(f" Total unanimous (any category): {total_unanimous}")
    print(f" Total unanimous N/O: {len(unanimous_no)}")
    print()
    print(f" Potentially miscoded unanimous N/O:")
    print(f" Materiality language (likely SI): {len(no_with_mat)}")
    print(f" Insurance (likely SI): {len(no_insurance)}")
    print(f" Budget/investment (likely SI): {len(no_budget)}")
    print(f" Incident language (likely SI or ID): {len(no_incident)}")
    print(f" Any of above (deduplicated): {len(no_any_miscoded)}")
    print(f" Incident (separate concern): {len(no_incident_pids)}")
    print()

    # Overlap analysis
    mat_set = set(pid for pid, _ in no_with_mat)
    ins_set = set(pid for pid, _ in no_insurance)
    bud_set = set(pid for pid, _ in no_budget)
    print(f" Overlap analysis:")
    print(f" Materiality ∩ Insurance: {len(mat_set & ins_set)}")
    print(f" Materiality ∩ Budget: {len(mat_set & bud_set)}")
    print(f" Insurance ∩ Budget: {len(ins_set & bud_set)}")
    print()

    pct_no_affected = len(no_any_miscoded) / max(1, len(unanimous_no)) * 100
    pct_total_affected = len(no_any_miscoded) / max(1, total_unanimous) * 100
    pct_all_affected = len(no_any_miscoded) / max(1, total_annotations) * 100

    print(f" Impact estimates:")
    print(f" % of unanimous N/O potentially miscoded: {pct_no_affected:.1f}%")
    print(f" % of all unanimous labels affected: {pct_total_affected:.1f}%")
    print(f" % of all paragraphs affected: {pct_all_affected:.1f}%")

    # Also check majority N/O
    maj_no_any = set()
    for pid in majority_no:
        text = texts.get(pid)
        if text is None:
            continue
        if has_materiality_language(text) or PAT_INSURANCE.search(text) or PAT_BUDGET.search(text):
            maj_no_any.add(pid)

    print(f"\n Majority N/O (2/3) potentially miscoded: {len(maj_no_any)} / {len(majority_no)}")
    print(f" Combined (unanimous + majority) potentially miscoded N/O: {len(no_any_miscoded) + len(maj_no_any)}")

    # ════════════════════════════════════════════════════════════════════
    # 6. Cross-check with holdout / human labels
    # ════════════════════════════════════════════════════════════════════
    separator("6. HOLDOUT CROSS-CHECK WITH HUMAN LABELS")

    # Find holdout paragraphs that Stage 1 unanimously called N/O but contain materiality language
    holdout_no_mat: list[tuple[str, str]] = []
    holdout_no_mat_with_human: list[tuple[str, str, list[dict]]] = []

    for pid, para in holdout.items():
        if para.get("stage1Category") == "None/Other" and para.get("stage1Method") == "unanimous":
            text = para["text"]
            if has_materiality_language(text):
                holdout_no_mat.append((pid, text))
                if pid in human_labels:
                    holdout_no_mat_with_human.append((pid, text, human_labels[pid]))

    print(f"\n Holdout paragraphs with stage1 unanimous N/O: "
          f"{sum(1 for p in holdout.values() if p.get('stage1Category') == 'None/Other' and p.get('stage1Method') == 'unanimous')}")
    print(f" Of those, with materiality language: {len(holdout_no_mat)}")
    print(f" Of those, with human labels: {len(holdout_no_mat_with_human)}")

    # What did humans call these?
    if holdout_no_mat_with_human:
        human_cats_for_flagged = Counter()
        for pid, text, hlabels in holdout_no_mat_with_human:
            for hl in hlabels:
                human_cats_for_flagged[hl["category"]] += 1

        print(f"\n Human labels for flagged paragraphs (Stage1=unanimous N/O, has materiality language):")
        total_human = sum(human_cats_for_flagged.values())
        for cat, cnt in human_cats_for_flagged.most_common():
            print(f" {cat}: {cnt} ({cnt / total_human * 100:.1f}%)")

        print(f"\n --- Examples where humans disagreed with Stage 1 N/O ---")
        shown = 0
        for pid, text, hlabels in holdout_no_mat_with_human:
            non_no = [hl for hl in hlabels if hl["category"] != "None/Other"]
            if non_no:
                human_str = ", ".join(f"{hl['annotator']}={hl['category']}" for hl in hlabels)
                print_example(shown + 1, pid, text, f"Human labels: {human_str}")
                shown += 1
                if shown >= 5:
                    break

        # Also show ones where humans agreed it IS N/O
        print(f"\n --- Examples where humans also said N/O (materiality language is ambiguous) ---")
        shown = 0
        for pid, text, hlabels in holdout_no_mat_with_human:
            all_no = all(hl["category"] == "None/Other" for hl in hlabels)
            if all_no and len(hlabels) >= 2:
                print_example(shown + 1, pid, text, "All humans agreed: N/O")
                shown += 1
                if shown >= 3:
                    break
    else:
        print("\n No human labels available for flagged holdout paragraphs.")

    # Broader holdout analysis: all cases where Stage 1 said N/O but humans said something else
    separator("6b. HOLDOUT: ALL Stage1=N/O vs HUMAN DISAGREEMENTS")

    holdout_no_all = [pid for pid, p in holdout.items()
                      if p.get("stage1Category") == "None/Other"]
    stage1_no_human_disagree = []
    for pid in holdout_no_all:
        if pid in human_labels:
            hlabels = human_labels[pid]
            non_no = [hl for hl in hlabels if hl["category"] != "None/Other"]
            if non_no:
                stage1_no_human_disagree.append((pid, holdout[pid]["text"], hlabels))

    print(f"\n All holdout paragraphs with Stage1=N/O (any method): {len(holdout_no_all)}")
    print(f" Of those with human labels that disagree: {len(stage1_no_human_disagree)}")

    if stage1_no_human_disagree:
        # What did humans call them?
        human_override = Counter()
        for pid, text, hlabels in stage1_no_human_disagree:
            for hl in hlabels:
                if hl["category"] != "None/Other":
                    human_override[hl["category"]] += 1
        print(f"\n Humans' non-N/O labels for Stage1=N/O paragraphs:")
        for cat, cnt in human_override.most_common():
            print(f" {cat}: {cnt}")

    # ════════════════════════════════════════════════════════════════════
    # 7. Other confusion axes
    # ════════════════════════════════════════════════════════════════════
    separator("7. OTHER CONFUSION AXES IN STAGE 1")

    # 7a. Unanimous MR with program/framework/process language (potential RMP)
    mr_with_process = []
    for pid in unanimous_mr:
        text = texts.get(pid)
        if text is None:
            continue
        matches = PAT_PROGRAM_FRAMEWORK.findall(text)
        if len(matches) >= 2:  # Multiple mentions = likely process-focused
            mr_with_process.append((pid, text, matches))

    print(f"\n 7a. Unanimous MR with prominent program/framework/process language")
    print(f" (>=2 mentions — potentially should be RMP)")
    print(f" Count: {len(mr_with_process)} / {len(unanimous_mr)} ({len(mr_with_process) / max(1, len(unanimous_mr)) * 100:.1f}%)")
    print(f"\n --- 3 examples ---")
    for i, (pid, text, matches) in enumerate(mr_with_process[:3]):
        print_example(i + 1, pid, text, f"Pattern matches: {matches[:6]}")

    # 7b. Unanimous RMP with specific titles (potential MR)
    rmp_with_titles = []
    for pid in unanimous_rmp:
        text = texts.get(pid)
        if text is None:
            continue
        titles = PAT_TITLE.findall(text)
        if titles:
            rmp_with_titles.append((pid, text, titles))

    print(f"\n 7b. Unanimous RMP mentioning specific people/titles")
    print(f" (potentially should be MR)")
    print(f" Count: {len(rmp_with_titles)} / {len(unanimous_rmp)} ({len(rmp_with_titles) / max(1, len(unanimous_rmp)) * 100:.1f}%)")
    print(f"\n --- 3 examples ---")
    for i, (pid, text, titles) in enumerate(rmp_with_titles[:3]):
        print_example(i + 1, pid, text, f"Titles found: {titles[:5]}")

    # 7c. Unanimous BG primarily about management officers
    bg_about_mgmt = []
    for pid in unanimous_bg:
        text = texts.get(pid)
        if text is None:
            continue
        has_titles = PAT_TITLE.findall(text)
        has_mgmt = PAT_MANAGEMENT_OFFICERS.findall(text)
        # If it has management language but no board language
        board_pattern = re.compile(r"\b(?:board|director(?:s)?|committee|audit)\b", re.IGNORECASE)
        has_board = board_pattern.findall(text)
        if (has_titles or has_mgmt) and not has_board:
            bg_about_mgmt.append((pid, text, has_titles + has_mgmt))

    print(f"\n 7c. Unanimous BG primarily about management (no board/committee language)")
    print(f" Count: {len(bg_about_mgmt)} / {len(unanimous_bg)} ({len(bg_about_mgmt) / max(1, len(unanimous_bg)) * 100:.1f}%)")
    if bg_about_mgmt:
        print(f"\n --- 3 examples ---")
        for i, (pid, text, matches) in enumerate(bg_about_mgmt[:3]):
            print_example(i + 1, pid, text, f"Matches: {matches[:5]}")

    # ════════════════════════════════════════════════════════════════════
    # SUMMARY
    # ════════════════════════════════════════════════════════════════════
    separator("SUMMARY")

    print(f"""
DATASET OVERVIEW
 Total paragraphs annotated (3 models each): {total_annotations:,}
 Total unanimous labels: {total_unanimous:,}
 Unanimous N/O: {len(unanimous_no):,}
 Majority N/O (2/3): {len(majority_no):,}

PRIMARY CONCERN: N/O → SI MISCODING
 Unanimous N/O with materiality language: {len(no_with_mat):,} ({len(no_with_mat) / max(1, len(unanimous_no)) * 100:.1f}% of unanimous N/O)
 Majority N/O with materiality language: {len(maj_no_with_mat):,} ({len(maj_no_with_mat) / max(1, len(majority_no)) * 100:.1f}% of majority N/O)
 Unanimous N/O with insurance: {len(no_insurance):,}
 Unanimous N/O with budget/investment: {len(no_budget):,}
 Unanimous N/O with incident language: {len(no_incident):,}
 Total potentially miscoded (deduplicated): {len(no_any_miscoded):,}

IMPACT ON TRAINING SET
 % of unanimous N/O affected: {pct_no_affected:.1f}%
 % of all unanimous labels affected: {pct_total_affected:.1f}%
 % of all paragraphs affected: {pct_all_affected:.1f}%

OTHER CONFUSION AXES
 MR ↔ RMP confusion (MR with process language): {len(mr_with_process):,} / {len(unanimous_mr):,}
 RMP ↔ MR confusion (RMP with titles): {len(rmp_with_titles):,} / {len(unanimous_rmp):,}
 BG about management (no board language): {len(bg_about_mgmt):,} / {len(unanimous_bg):,}

HOLDOUT VALIDATION
 Stage1=unanimous N/O with materiality language: {len(holdout_no_mat):,}
 Of those with human labels: {len(holdout_no_mat_with_human):,}
""")

    if holdout_no_mat_with_human:
        human_cats_for_flagged = Counter()
        for pid, text, hlabels in holdout_no_mat_with_human:
            for hl in hlabels:
                human_cats_for_flagged[hl["category"]] += 1
        print(" HUMAN VALIDATION (flagged holdout paragraphs):")
        total_h = sum(human_cats_for_flagged.values())
        for cat, cnt in human_cats_for_flagged.most_common():
            print(f" {cat}: {cnt} ({cnt / total_h * 100:.1f}%)")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point; see the module docstring for how to run it.
    main()
|