"""Examine specific paragraphs where v3.5 performed WORSE than v3.0 against human labels.

Focus on BG↔MR and MR↔RMP confusion axes.
"""
|
|
|
|
import json
import textwrap
from collections import Counter, defaultdict
from pathlib import Path

# ── Paths ──────────────────────────────────────────────────────────────────────
# Repository root: this script lives two levels below it.
ROOT = Path(__file__).resolve().parent.parent

V30_GOLDEN = ROOT / "data/annotations/golden/opus.jsonl"
V35_GOLDEN = ROOT / "data/annotations/golden-v35/opus.jsonl"

V30_BENCH = ROOT / "data/annotations/bench-holdout"
V35_BENCH = ROOT / "data/annotations/bench-holdout-v35"

HUMAN_LABELS = ROOT / "data/gold/human-labels-raw.jsonl"
HOLDOUT_META = ROOT / "data/gold/holdout-rerun-v35.jsonl"
PARAGRAPHS = ROOT / "data/gold/paragraphs-holdout.jsonl"

# Benchmark models: (annotation filename, display name), kept in one table so
# the two lists below can never drift out of alignment.
_MODELS: list[tuple[str, str]] = [
    ("opus.jsonl", "Opus"),
    ("gpt-5.4.jsonl", "GPT-5.4"),
    ("gemini-3.1-pro-preview.jsonl", "Gemini"),
    ("glm-5:exacto.jsonl", "GLM-5"),
    ("kimi-k2.5.jsonl", "Kimi"),
    ("mimo-v2-pro:exacto.jsonl", "Mimo"),
    ("minimax-m2.7:exacto.jsonl", "MiniMax"),
]
MODEL_FILES = [fname for fname, _ in _MODELS]
MODEL_NAMES = [name for _, name in _MODELS]

# Models to EXCLUDE from majority calculation
EXCLUDED_FROM_MAJORITY = {"MiniMax"}

# Short code → full category name, and the inverse mapping.
CAT_ABBREV = {
    "BG": "Board Governance",
    "MR": "Management Role",
    "RMP": "Risk Management Process",
    "SI": "Strategy Integration",
    "NO": "None/Other",
    "ID": "Incident Disclosure",
    "TPR": "Third-Party Risk",
}
ABBREV_CAT = {v: k for k, v in CAT_ABBREV.items()}
|
|
|
|
|
|
def abbrev(cat: str) -> str:
    """Return the short code for *cat*; unknown values pass through unchanged."""
    try:
        return ABBREV_CAT[cat]
    except KeyError:
        return cat
|
|
|
|
|
|
def load_jsonl(path: Path) -> list[dict]:
    """Parse a JSON-Lines file into a list of records, skipping blank lines."""
    records: list[dict] = []
    with open(path) as f:
        for line in f:
            if line.strip():
                records.append(json.loads(line))
    return records
|
|
|
|
|
|
def load_annotations(base_dir: Path, filename: str) -> dict[str, str]:
    """Load paragraphId → content_category mapping from *base_dir*/*filename*."""
    mapping: dict[str, str] = {}
    for rec in load_jsonl(base_dir / filename):
        mapping[rec["paragraphId"]] = rec["label"]["content_category"]
    return mapping
|
|
|
|
|
|
def load_golden(path: Path) -> dict[str, str]:
    """Load paragraphId → content_category mapping from a golden annotation file."""
    return {
        rec["paragraphId"]: rec["label"]["content_category"]
        for rec in load_jsonl(path)
    }
|
|
|
|
|
|
# ── Load all data ─────────────────────────────────────────────────────────────

print("Loading data...")

# Confusion-axis metadata: paragraphId → list of axis names (e.g. "BG_MR").
pid_axes: dict[str, list[str]] = {
    rec["paragraphId"]: rec["axes"] for rec in load_jsonl(HOLDOUT_META)
}
all_pids = set(pid_axes)

# Human labels restricted to holdout paragraphs:
# paragraphId → list of (annotator, category).
human_labels: dict[str, list[tuple[str, str]]] = defaultdict(list)
for rec in load_jsonl(HUMAN_LABELS):
    pid = rec["paragraphId"]
    if pid in all_pids:
        human_labels[pid].append((rec["annotatorName"], rec["contentCategory"]))
|
|
|
def human_majority(pid: str) -> str | None:
    """Return the modal human category for *pid*, or None when no labels exist.

    Ties break toward the first-seen category (Counter.most_common order).
    """
    votes = human_labels.get(pid)
    if not votes:
        return None
    tally = Counter(cat for _, cat in votes)
    return tally.most_common(1)[0][0]
|
|
|
|
|
|
# Paragraph text keyed by paragraph id.
para_records = load_jsonl(PARAGRAPHS)
para_text: dict[str, str] = {r["id"]: r["text"] for r in para_records}


def _load_signal_set(golden_path: Path, bench_dir: Path) -> list[dict[str, str]]:
    """Load per-model pid → category maps for one guideline version.

    Opus is read from its golden file (*golden_path*); every other model from
    *bench_dir*. Returns one dict per entry in MODEL_FILES, in order, so the
    result is index-aligned with MODEL_NAMES.
    """
    signals: list[dict[str, str]] = []
    for fname in MODEL_FILES:
        if fname == "opus.jsonl":
            signals.append(load_golden(golden_path))
        else:
            signals.append(load_annotations(bench_dir, fname))
    return signals


# model_idx → {pid: category}, one list per guideline version.
# (Previously two copy-pasted loops; factored into _load_signal_set.)
v30_signals: list[dict[str, str]] = _load_signal_set(V30_GOLDEN, V30_BENCH)
v35_signals: list[dict[str, str]] = _load_signal_set(V35_GOLDEN, V35_BENCH)
|
|
|
|
|
|
def get_signals(signals: list[dict[str, str]], pid: str) -> list[str | None]:
|
|
"""Get category from each model for a paragraph."""
|
|
return [s.get(pid) for s in signals]
|
|
|
|
|
|
def majority_vote(signals: list[str | None], exclude_minimax: bool = True) -> str | None:
|
|
"""Compute majority from 6 models (excluding minimax which is index 6)."""
|
|
cats = []
|
|
for i, s in enumerate(signals):
|
|
if s is None:
|
|
continue
|
|
if exclude_minimax and MODEL_NAMES[i] in EXCLUDED_FROM_MAJORITY:
|
|
continue
|
|
cats.append(s)
|
|
if not cats:
|
|
return None
|
|
counts = Counter(cats)
|
|
return counts.most_common(1)[0][0]
|
|
|
|
|
|
def unanimity_score(signals: list[str | None], exclude_minimax: bool = True) -> float:
|
|
"""Fraction of models agreeing with majority (0-1)."""
|
|
cats = []
|
|
for i, s in enumerate(signals):
|
|
if s is None:
|
|
continue
|
|
if exclude_minimax and MODEL_NAMES[i] in EXCLUDED_FROM_MAJORITY:
|
|
continue
|
|
cats.append(s)
|
|
if not cats:
|
|
return 0.0
|
|
counts = Counter(cats)
|
|
top_count = counts.most_common(1)[0][1]
|
|
return top_count / len(cats)
|
|
|
|
|
|
def format_signals(signals: list[str | None]) -> str:
    """Render per-model votes as 'Name=CAT' pairs; '??' marks a missing vote."""
    rendered = (
        f"{name}=??" if cat is None else f"{name}={abbrev(cat)}"
        for name, cat in zip(MODEL_NAMES, signals)
    )
    return ", ".join(rendered)
|
|
|
|
|
|
def wrap_text(text: str, width: int = 100) -> str:
|
|
return "\n ".join(textwrap.wrap(text, width=width))
|
|
|
|
|
|
def print_paragraph_analysis(
    pid: str,
    v30_sigs: list[str | None],
    v35_sigs: list[str | None],
    header: str = "",
):
    """Print detailed analysis for a single paragraph.

    Shows the paragraph text, individual human votes, per-model signals and
    majorities under both guideline versions, which models flipped between
    versions, and a verdict line comparing each version's majority against
    the human majority.
    """
    text = para_text.get(pid, "[TEXT NOT FOUND]")
    h_labels = human_labels.get(pid, [])
    h_maj = human_majority(pid)
    v30_maj = majority_vote(v30_sigs)
    v35_maj = majority_vote(v35_sigs)
    axes = pid_axes.get(pid, [])

    print(f"\n{'─' * 110}")
    if header:
        print(f"  {header}")
        print(f"{'─' * 110}")

    print(f"  PID: {pid}")
    print(f"  Axes: {', '.join(axes)}")
    print(f"\n  TEXT:")
    print(f"    {wrap_text(text)}")

    print(f"\n  HUMAN VOTES:")
    for name, cat in h_labels:
        marker = " ✓" if cat == h_maj else ""
        print(f"    {name:12s} → {abbrev(cat):5s}{marker}")
    print(f"    Majority     → {abbrev(h_maj) if h_maj else '??'}")

    print(f"\n  v3.0 signals: {format_signals(v30_sigs)}")
    print(f"  v3.0 majority (excl. MiniMax): {abbrev(v30_maj) if v30_maj else '??'}")

    print(f"  v3.5 signals: {format_signals(v35_sigs)}")
    print(f"  v3.5 majority (excl. MiniMax): {abbrev(v35_maj) if v35_maj else '??'}")

    # Per-model flips between versions.
    changed_models = [
        f"{MODEL_NAMES[i]}: {abbrev(old)}→{abbrev(new)}"
        for i, (old, new) in enumerate(zip(v30_sigs, v35_sigs))
        if old is not None and new is not None and old != new
    ]
    if changed_models:
        print(f"\n  CHANGES: {', '.join(changed_models)}")

    # Verdict line. Two fixes over the previous version:
    #  1. Compare abbreviated codes, as every other comparison in this script
    #     does — a raw string compare can disagree when one side uses full
    #     category names and the other uses codes.
    #  2. A missing majority used to fall through to "WRONG" (the ternary
    #     produced None, which is falsy); report it as "N/A" instead.
    def _verdict(model_maj: str | None) -> str:
        if model_maj is None or h_maj is None:
            return "N/A"
        return "CORRECT" if abbrev(model_maj) == abbrev(h_maj) else "WRONG"

    print(f"  v3.0 {_verdict(v30_maj)} | v3.5 {_verdict(v35_maj)}")
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# SECTION 1: BG↔MR Regression Cases
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "═" * 110)
print("  SECTION 1: BG↔MR AXIS — REGRESSION CASES")
print("  (v3.0 matched human majority, but v3.5 does NOT)")
print("═" * 110)

bg_mr_pids = [pid for pid, axes in pid_axes.items() if "BG_MR" in axes]
print(f"\nTotal BG↔MR paragraphs: {len(bg_mr_pids)}")

# Keep only paragraphs that have at least one human label.
bg_mr_pids = [pid for pid in bg_mr_pids if human_majority(pid) is not None]
print(f"With human labels: {len(bg_mr_pids)}")

# Bucket every labeled BG↔MR paragraph by how each guideline version fared
# against the human majority.
regressions_bg_mr = []
improvements_bg_mr = []
both_correct_bg_mr = []
both_wrong_bg_mr = []

for pid in bg_mr_pids:
    sigs_old = get_signals(v30_signals, pid)
    sigs_new = get_signals(v35_signals, pid)
    maj_old = majority_vote(sigs_old)
    maj_new = majority_vote(sigs_new)
    h_maj = human_majority(pid)

    if maj_old is None or maj_new is None or h_maj is None:
        continue

    was_correct = abbrev(maj_old) == abbrev(h_maj)
    now_correct = abbrev(maj_new) == abbrev(h_maj)

    if was_correct and not now_correct:
        regressions_bg_mr.append(pid)
    elif now_correct and not was_correct:
        improvements_bg_mr.append(pid)
    elif was_correct:  # both versions correct
        both_correct_bg_mr.append(pid)
    else:  # both versions wrong
        both_wrong_bg_mr.append(pid)

print(f"\nBG↔MR Summary:")
print(f"  Both correct: {len(both_correct_bg_mr)}")
print(f"  Both wrong: {len(both_wrong_bg_mr)}")
print(f"  v3.0 correct → v3.5 WRONG (REGRESSIONS): {len(regressions_bg_mr)}")
print(f"  v3.0 wrong → v3.5 correct (IMPROVEMENTS): {len(improvements_bg_mr)}")

print(f"\n{'━' * 110}")
print(f"  BG↔MR REGRESSIONS (showing all, up to 20)")
print(f"{'━' * 110}")

for i, pid in enumerate(regressions_bg_mr[:20]):
    print_paragraph_analysis(
        pid,
        get_signals(v30_signals, pid),
        get_signals(v35_signals, pid),
        f"REGRESSION #{i+1}",
    )

# BG↔MR improvements
print(f"\n{'━' * 110}")
print(f"  BG↔MR IMPROVEMENTS (showing up to 5)")
print(f"{'━' * 110}")

for i, pid in enumerate(improvements_bg_mr[:5]):
    print_paragraph_analysis(
        pid,
        get_signals(v30_signals, pid),
        get_signals(v35_signals, pid),
        f"IMPROVEMENT #{i+1}",
    )
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# SECTION 2: MR↔RMP Non-Convergence Cases
# ══════════════════════════════════════════════════════════════════════════════

print("\n\n" + "═" * 110)
print("  SECTION 2: MR↔RMP AXIS — NON-CONVERGENCE AND REGRESSIONS")
print("═" * 110)

mr_rmp_pids = [pid for pid, axes in pid_axes.items() if "MR_RMP" in axes]
print(f"\nTotal MR↔RMP paragraphs: {len(mr_rmp_pids)}")
mr_rmp_pids = [pid for pid in mr_rmp_pids if human_majority(pid) is not None]
print(f"With human labels: {len(mr_rmp_pids)}")

# Find: less unanimous in v3.5 OR flipped away from human majority.
non_convergence_mr_rmp = []
regressions_mr_rmp = []
improvements_mr_rmp = []

for pid in mr_rmp_pids:
    sigs_old = get_signals(v30_signals, pid)
    sigs_new = get_signals(v35_signals, pid)
    maj_old = majority_vote(sigs_old)
    maj_new = majority_vote(sigs_new)
    h_maj = human_majority(pid)
    unan_old = unanimity_score(sigs_old)
    unan_new = unanimity_score(sigs_new)

    if maj_old is None or maj_new is None or h_maj is None:
        continue

    was_correct = abbrev(maj_old) == abbrev(h_maj)
    now_correct = abbrev(maj_new) == abbrev(h_maj)
    entry = (pid, unan_old, unan_new)

    # Regression: was correct, now wrong.
    if was_correct and not now_correct:
        regressions_mr_rmp.append(entry)

    # Non-convergence: less unanimous OR flipped away.
    if unan_new < unan_old or (was_correct and not now_correct):
        non_convergence_mr_rmp.append(entry)

    if now_correct and not was_correct:
        improvements_mr_rmp.append(entry)

# Sort non-convergence by unanimity drop, worst first.
non_convergence_mr_rmp.sort(key=lambda e: e[1] - e[2], reverse=True)

print(f"\nMR↔RMP Summary:")
print(f"  Regressions (correct→wrong): {len(regressions_mr_rmp)}")
print(f"  Non-convergence (less unanimous or regressed): {len(non_convergence_mr_rmp)}")
print(f"  Improvements (wrong→correct): {len(improvements_mr_rmp)}")

print(f"\n{'━' * 110}")
print(f"  MR↔RMP NON-CONVERGENCE / REGRESSION CASES (showing 10)")
print(f"{'━' * 110}")

printed_pids = set()
printed = 0
for pid, v30_u, v35_u in non_convergence_mr_rmp:
    if printed >= 10:
        break
    if pid in printed_pids:
        continue
    printed_pids.add(pid)
    sigs_old = get_signals(v30_signals, pid)
    sigs_new = get_signals(v35_signals, pid)
    maj_old = majority_vote(sigs_old)
    maj_new = majority_vote(sigs_new)
    h_maj = human_majority(pid)
    if abbrev(maj_old) == abbrev(h_maj) and abbrev(maj_new) != abbrev(h_maj):
        label = "REGRESSION"
    else:
        label = "LESS UNANIMOUS"
    print_paragraph_analysis(
        pid, sigs_old, sigs_new,
        f"{label} #{printed+1} (unanimity: v3.0={v30_u:.0%} → v3.5={v35_u:.0%})"
    )
    printed += 1

print(f"\n{'━' * 110}")
print(f"  MR↔RMP IMPROVEMENTS (showing up to 5)")
print(f"{'━' * 110}")

for i, (pid, v30_u, v35_u) in enumerate(improvements_mr_rmp[:5]):
    print_paragraph_analysis(
        pid,
        get_signals(v30_signals, pid),
        get_signals(v35_signals, pid),
        f"IMPROVEMENT #{i+1} (unanimity: v3.0={v30_u:.0%} → v3.5={v35_u:.0%})"
    )
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# SECTION 3: Error Pattern Analysis
# ══════════════════════════════════════════════════════════════════════════════

print("\n\n" + "═" * 110)
print("  SECTION 3: ERROR PATTERN ANALYSIS")
print("═" * 110)

# ── BG↔MR regression patterns ───────────────────────────────────────────────
print(f"\n{'━' * 110}")
print(f"  3A: BG↔MR REGRESSION PATTERNS")
print(f"{'━' * 110}")

if regressions_bg_mr:
    # Direction of the majority flip, and which models did the flipping.
    regression_directions = Counter()
    regression_model_flips = Counter()

    for pid in regressions_bg_mr:
        h_maj = human_majority(pid)
        sigs_old = get_signals(v30_signals, pid)
        sigs_new = get_signals(v35_signals, pid)
        maj_old = majority_vote(sigs_old)
        maj_new = majority_vote(sigs_new)
        key = f"{abbrev(maj_old)}→{abbrev(maj_new)} (human={abbrev(h_maj)})"
        regression_directions[key] += 1

        # Which models flipped?
        for idx, (before, after) in enumerate(zip(sigs_old, sigs_new)):
            if before and after and before != after:
                regression_model_flips[MODEL_NAMES[idx]] += 1

    print(f"\n  Regression directions (v3.0→v3.5, human ground truth):")
    for direction, count in regression_directions.most_common():
        print(f"    {direction}: {count}")

    print(f"\n  Models that flipped most on regressions:")
    for model, count in regression_model_flips.most_common():
        print(f"    {model}: {count} flips")

    # Text pattern analysis: keyword incidence across regression paragraphs.
    print(f"\n  Common textual signals in regression paragraphs:")
    signal_words = dict.fromkeys(
        [
            "board", "committee", "oversee", "oversight",
            "report", "director", "officer", "CISO",
            "governance", "responsible", "qualif", "experience",
            "manage", "program", "framework", "process",
            "audit",
        ],
        0,
    )
    for pid in regressions_bg_mr:
        lowered = para_text.get(pid, "").lower()
        for word in signal_words:
            if word.lower() in lowered:
                signal_words[word] += 1

    total_reg = len(regressions_bg_mr)
    for word, count in sorted(signal_words.items(), key=lambda kv: -kv[1]):
        if count > 0:
            print(f"    '{word}': {count}/{total_reg} ({count/total_reg:.0%})")

    # Check if humans are split on these.
    print(f"\n  Human agreement on regressions:")
    unanimous_human = 0
    split_human = 0
    for pid in regressions_bg_mr:
        cats = [c for _, c in human_labels.get(pid, [])]
        if len(set(cats)) == 1:
            unanimous_human += 1
        else:
            split_human += 1
    print(f"    Unanimous human: {unanimous_human}")
    print(f"    Split human (2-1): {split_human}")

    if split_human > 0:
        print(f"\n  Split-human regression details:")
        for pid in regressions_bg_mr:
            pid_votes = human_labels.get(pid, [])
            if len({c for _, c in pid_votes}) > 1:
                votes = ", ".join(f"{n}={abbrev(c)}" for n, c in pid_votes)
                print(f"    {pid[:12]}... → {votes}")
else:
    print("\n  No BG↔MR regressions found.")
|
|
|
|
# ── MR↔RMP patterns ─────────────────────────────────────────────────────────
print(f"\n{'━' * 110}")
print(f"  3B: MR↔RMP NON-CONVERGENCE PATTERNS")
print(f"{'━' * 110}")

if non_convergence_mr_rmp:
    # Direction of the shift, and which models flipped.
    nc_directions = Counter()
    nc_model_flips = Counter()

    for pid, _, _ in non_convergence_mr_rmp:
        h_maj = human_majority(pid)
        sigs_old = get_signals(v30_signals, pid)
        sigs_new = get_signals(v35_signals, pid)
        maj_old = majority_vote(sigs_old)
        maj_new = majority_vote(sigs_new)
        key = f"{abbrev(maj_old)}→{abbrev(maj_new)} (human={abbrev(h_maj)})"
        nc_directions[key] += 1

        for idx, (before, after) in enumerate(zip(sigs_old, sigs_new)):
            if before and after and before != after:
                nc_model_flips[MODEL_NAMES[idx]] += 1

    print(f"\n  Direction of non-convergent shifts:")
    for direction, count in nc_directions.most_common():
        print(f"    {direction}: {count}")

    print(f"\n  Models that flipped most:")
    for model, count in nc_model_flips.most_common():
        print(f"    {model}: {count} flips")

    # Text pattern analysis — compare what helped vs what didn't.
    print(f"\n  Text signals in NON-CONVERGENT vs IMPROVED paragraphs:")

    keywords = ["CISO", "officer", "responsible", "oversee", "report",
                "program", "framework", "qualif", "experience", "certif",
                "manage", "assess", "monitor", "team", "director"]

    nc_pids_set = {pid for pid, _, _ in non_convergence_mr_rmp}
    imp_pids_set = {pid for pid, _, _ in improvements_mr_rmp}

    print(f"\n  {'Keyword':<16} {'Non-conv':>10} {'Improved':>10}")
    print(f"  {'─'*16} {'─'*10} {'─'*10}")
    for kw in keywords:
        needle = kw.lower()
        nc_count = sum(1 for pid in nc_pids_set if needle in para_text.get(pid, "").lower())
        imp_count = sum(1 for pid in imp_pids_set if needle in para_text.get(pid, "").lower())
        nc_pct = f"{nc_count}/{len(nc_pids_set)}" if nc_pids_set else "0"
        imp_pct = f"{imp_count}/{len(imp_pids_set)}" if imp_pids_set else "0"
        print(f"  {kw:<16} {nc_pct:>10} {imp_pct:>10}")

    # Person-removal test analysis.
    print(f"\n  Person-removal test applicability:")
    print(f"    Checking if regression paragraphs have person as ONLY subject...")
    person_markers = ["ciso", "chief information", "chief technology",
                      "vice president", "director of", "officer"]
    process_markers = ["program", "framework", "process", "system",
                       "controls", "policies", "procedures"]
    for pid, _, _ in regressions_mr_rmp:
        lowered = para_text.get(pid, "").lower()
        has_person_subject = any(m in lowered for m in person_markers)
        has_process_subject = any(m in lowered for m in process_markers)
        h_maj = human_majority(pid)
        v35_maj = majority_vote(get_signals(v35_signals, pid))
        print(
            f"    {pid[:12]}... person_subj={has_person_subject} "
            f"process_subj={has_process_subject} "
            f"human={abbrev(h_maj)} v3.5={abbrev(v35_maj)}"
        )
else:
    print("\n  No MR↔RMP non-convergence cases found.")
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# SECTION 4: Ruling Recommendations
# ══════════════════════════════════════════════════════════════════════════════

print("\n\n" + "═" * 110)
print("  SECTION 4: RULING RECOMMENDATIONS")
print("═" * 110)

print("""
Based on the error analysis above, here are the specific ruling observations:

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
4A: BG↔MR Board-Line Test
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

CURRENT RULING (Rule 2):
  "When a paragraph spans layers (governance chain paragraphs): apply the
  dominant-subject test — which layer occupies the most sentence-subjects?"

  "Governance overview spanning board → committee → officer → program →
  Board Governance if the board/committee occupies more sentence-subjects;
  Management Role if the officer does; Risk Management Process if the
  program does"
""")

# Analyze the specific regressions to give targeted advice.
if regressions_bg_mr:
    # Tally the flip directions among BG↔MR regressions.
    bg_to_mr = sum(
        1 for pid in regressions_bg_mr
        if abbrev(majority_vote(get_signals(v35_signals, pid))) == "MR"
        and abbrev(human_majority(pid)) == "BG"
    )
    mr_to_bg = sum(
        1 for pid in regressions_bg_mr
        if abbrev(majority_vote(get_signals(v35_signals, pid))) == "BG"
        and abbrev(human_majority(pid)) == "MR"
    )
    other_dir = len(regressions_bg_mr) - bg_to_mr - mr_to_bg

    print(f"  EMPIRICAL FINDING:")
    print(f"    Regressions that moved BG→MR (human says BG): {bg_to_mr}")
    print(f"    Regressions that moved MR→BG (human says MR): {mr_to_bg}")
    print(f"    Other directions: {other_dir}")

    if bg_to_mr > mr_to_bg:
        print("""
  DIAGNOSIS: The dominant-subject test is OVER-CORRECTING toward MR.
  When a governance chain mentions a CISO or officer, models are counting that
  mention as a "sentence subject" even when the paragraph's primary purpose is
  describing the board/committee oversight structure.

  PROPOSED FIX — add a "purpose test" before the subject count:
    "Before counting sentence-subjects, ask: what is the paragraph's PRIMARY
    COMMUNICATIVE PURPOSE? If it is to describe the oversight/reporting
    structure (who oversees whom, what gets reported where), the paragraph
    is Board Governance even if individual officers are named as intermediaries.
    The dominant-subject count applies only when the paragraph's purpose is
    genuinely ambiguous between describing the oversight structure and
    describing the officer's role."

  Alternatively, add a carve-out:
    "A governance chain paragraph (board → committee → officer → program)
    defaults to Board Governance unless the officer section constitutes
    MORE THAN HALF the paragraph's content AND includes qualifications,
    credentials, or personal background."
""")
    elif mr_to_bg > bg_to_mr:
        print("""
  DIAGNOSIS: The dominant-subject test is OVER-CORRECTING toward BG.
  Paragraphs that are primarily about management roles are being pulled
  toward BG because they mention board oversight.

  PROPOSED FIX:
    "When a paragraph's primary content is about a management role (CISO,
    CIO, etc.) and mentions board oversight only as context for the
    reporting relationship, classify as Management Role. Board Governance
    requires the board/committee to be the PRIMARY ACTOR, not merely
    the recipient of reports."
""")

print("""
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
4B: MR↔RMP Three-Step Chain
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

CURRENT RULING (Rule 2b):
  "Step 1 — Subject test: What is the paragraph's grammatical subject?
   Step 2 — Person-removal test: Could you delete all named roles, titles,
   qualifications, experience descriptions, and credentials from the
   paragraph and still have a coherent cybersecurity disclosure?
   Step 3 — Qualifications tiebreaker: Does the paragraph include experience
   (years), certifications (CISSP, CISM), education, team size, or career
   history for named individuals?"
""")

if regressions_mr_rmp:
    mr_to_rmp = sum(
        1 for pid, _, _ in regressions_mr_rmp
        if abbrev(majority_vote(get_signals(v35_signals, pid))) == "RMP"
        and abbrev(human_majority(pid)) == "MR"
    )
    rmp_to_mr = sum(
        1 for pid, _, _ in regressions_mr_rmp
        if abbrev(majority_vote(get_signals(v35_signals, pid))) == "MR"
        and abbrev(human_majority(pid)) == "RMP"
    )

    print(f"  EMPIRICAL FINDING:")
    print(f"    Regressions that moved MR→RMP (human says MR): {mr_to_rmp}")
    print(f"    Regressions that moved RMP→MR (human says RMP): {rmp_to_mr}")

    if mr_to_rmp > rmp_to_mr:
        print("""
  DIAGNOSIS: The person-removal test is TOO AGGRESSIVE at removing people.
  When a paragraph describes a CISO's monitoring activities, the person-removal
  test says "yes, the monitoring process stands alone," but the HUMANS recognize
  that the paragraph is fundamentally about the management role's responsibilities.

  PROPOSED FIX — tighten the person-removal test:
    "Step 2 — Person-removal test: Delete all named roles AND their associated
    ACTIVITIES. If the paragraph still describes a cybersecurity process or
    framework, it is Risk Management Process. If deleting the roles and their
    activities leaves nothing substantive, it is Management Role.
    Key distinction: 'The CISO monitors threat intelligence' — removing the
    CISO removes the monitoring activity, so this is Management Role.
    'The company monitors threat intelligence under the direction of the CISO'
    — removing the CISO leaves the monitoring intact, so this is RMP."
""")
    elif rmp_to_mr > mr_to_rmp:
        print("""
  DIAGNOSIS: The three-step chain is UNDER-APPLYING the person-removal test.
  Models are stopping at Step 1 (subject test) when they see a role title,
  without proceeding to the person-removal test.

  PROPOSED FIX:
    "Step 1 should only produce a STRONG signal, not a decisive result.
    Always proceed to Step 2 unless the paragraph is ENTIRELY about
    a person's credentials with no process content whatsoever."
""")

if not regressions_mr_rmp:
    print("""
  No MR↔RMP regressions found. The three-step chain may be working correctly,
  or the non-convergence is increasing uncertainty without changing majority votes.
  Focus on whether the increased model disagreement reflects genuine ambiguity
  or whether the step instructions need to be more prescriptive.
""")
|
|
|
|
# ── Final summary stats ──────────────────────────────────────────────────────
print("\n" + "═" * 110)
print("  FINAL SUMMARY")
print("═" * 110)


def _accuracy_counts(pids) -> tuple[int, int, int]:
    """Return (n, v30_correct, v35_correct) accuracy counts over *pids*.

    A paragraph is counted only when it has a human majority AND both
    guideline versions produce a model majority; correctness compares
    abbreviated category codes. Factored out of the previously duplicated
    overall/per-axis loops.
    """
    n = v30_ok = v35_ok = 0
    for pid in pids:
        h_maj = human_majority(pid)
        if h_maj is None:
            continue
        v30_maj = majority_vote(get_signals(v30_signals, pid))
        v35_maj = majority_vote(get_signals(v35_signals, pid))
        if v30_maj is None or v35_maj is None:
            continue
        n += 1
        if abbrev(v30_maj) == abbrev(h_maj):
            v30_ok += 1
        if abbrev(v35_maj) == abbrev(h_maj):
            v35_ok += 1
    return n, v30_ok, v35_ok


# Overall accuracy comparison.
total_with_human, v30_correct_total, v35_correct_total = _accuracy_counts(all_pids)

# Guard: the previous version divided by total_with_human unconditionally and
# raised ZeroDivisionError when no paragraph qualified.
if total_with_human:
    print(f"\n  Overall accuracy on {total_with_human} confusion-axis paragraphs:")
    print(f"    v3.0: {v30_correct_total}/{total_with_human} ({v30_correct_total/total_with_human:.1%})")
    print(f"    v3.5: {v35_correct_total}/{total_with_human} ({v35_correct_total/total_with_human:.1%})")
    print(f"    Delta: {v35_correct_total - v30_correct_total:+d}")
else:
    print("\n  No confusion-axis paragraphs with both human labels and model majorities.")

# Per-axis breakdown.
for axis_name in ["BG_MR", "MR_RMP", "BG_RMP", "SI_NO"]:
    axis_pids = [pid for pid, axes in pid_axes.items() if axis_name in axes]
    n, v30_c, v35_c = _accuracy_counts(axis_pids)
    if n > 0:
        print(f"\n  {axis_name} ({n} paragraphs):")
        print(f"    v3.0: {v30_c}/{n} ({v30_c/n:.1%})")
        print(f"    v3.5: {v35_c}/{n} ({v35_c/n:.1%})")
        print(f"    Delta: {v35_c - v30_c:+d}")

print()
|