"""Examine specific paragraphs where v3.5 performed WORSE than v3.0 against human labels. Focus on BG↔MR and MR↔RMP confusion axes. """ import json import textwrap from collections import Counter, defaultdict from pathlib import Path # ── Paths ────────────────────────────────────────────────────────────────────── ROOT = Path(__file__).resolve().parent.parent V30_GOLDEN = ROOT / "data/annotations/golden/opus.jsonl" V35_GOLDEN = ROOT / "data/annotations/golden-v35/opus.jsonl" V30_BENCH = ROOT / "data/annotations/bench-holdout" V35_BENCH = ROOT / "data/annotations/bench-holdout-v35" HUMAN_LABELS = ROOT / "data/gold/human-labels-raw.jsonl" HOLDOUT_META = ROOT / "data/gold/holdout-rerun-v35.jsonl" PARAGRAPHS = ROOT / "data/gold/paragraphs-holdout.jsonl" MODEL_FILES = [ "opus.jsonl", "gpt-5.4.jsonl", "gemini-3.1-pro-preview.jsonl", "glm-5:exacto.jsonl", "kimi-k2.5.jsonl", "mimo-v2-pro:exacto.jsonl", "minimax-m2.7:exacto.jsonl", ] MODEL_NAMES = [ "Opus", "GPT-5.4", "Gemini", "GLM-5", "Kimi", "Mimo", "MiniMax", ] # Models to EXCLUDE from majority calculation EXCLUDED_FROM_MAJORITY = {"MiniMax"} CAT_ABBREV = { "BG": "Board Governance", "MR": "Management Role", "RMP": "Risk Management Process", "SI": "Strategy Integration", "NO": "None/Other", "ID": "Incident Disclosure", "TPR": "Third-Party Risk", } ABBREV_CAT = {v: k for k, v in CAT_ABBREV.items()} def abbrev(cat: str) -> str: return ABBREV_CAT.get(cat, cat) def load_jsonl(path: Path) -> list[dict]: with open(path) as f: return [json.loads(line) for line in f if line.strip()] def load_annotations(base_dir: Path, filename: str) -> dict[str, str]: """Load paragraphId → content_category mapping.""" path = base_dir / filename records = load_jsonl(path) return {r["paragraphId"]: r["label"]["content_category"] for r in records} def load_golden(path: Path) -> dict[str, str]: records = load_jsonl(path) return {r["paragraphId"]: r["label"]["content_category"] for r in records} # ── Load all data ───────────────────────────────────────────────────────────── print("Loading data...") # Confusion axis metadata meta_records = load_jsonl(HOLDOUT_META) pid_axes: dict[str, list[str]] = {r["paragraphId"]: r["axes"] for r in meta_records} all_pids = set(pid_axes.keys()) # Human labels: paragraphId → list of (annotator, category) human_raw = load_jsonl(HUMAN_LABELS) human_labels: dict[str, list[tuple[str, str]]] = defaultdict(list) for r in human_raw: if r["paragraphId"] in all_pids: human_labels[r["paragraphId"]].append( (r["annotatorName"], r["contentCategory"]) ) def human_majority(pid: str) -> str | None: """Return majority category from human annotators, or None if no data.""" labels = human_labels.get(pid) if not labels: return None cats = [c for _, c in labels] counts = Counter(cats) top = counts.most_common(1)[0] return top[0] # Paragraph text para_records = load_jsonl(PARAGRAPHS) para_text: dict[str, str] = {r["id"]: r["text"] for r in para_records} # v3.0 signals: model_idx → {pid: category} v30_signals: list[dict[str, str]] = [] for fname in MODEL_FILES: if fname == "opus.jsonl": v30_signals.append(load_golden(V30_GOLDEN)) else: v30_signals.append(load_annotations(V30_BENCH, fname)) # v3.5 signals v35_signals: list[dict[str, str]] = [] for fname in MODEL_FILES: if fname == "opus.jsonl": v35_signals.append(load_golden(V35_GOLDEN)) else: v35_signals.append(load_annotations(V35_BENCH, fname)) def get_signals(signals: list[dict[str, str]], pid: str) -> list[str | None]: """Get category from each model for a paragraph.""" return [s.get(pid) for s in signals] def majority_vote(signals: list[str | None], exclude_minimax: bool = True) -> str | None: """Compute majority from 6 models (excluding minimax which is index 6).""" cats = [] for i, s in enumerate(signals): if s is None: continue if exclude_minimax and MODEL_NAMES[i] in EXCLUDED_FROM_MAJORITY: continue cats.append(s) if not cats: return None counts = Counter(cats) return counts.most_common(1)[0][0] def unanimity_score(signals: list[str | None], exclude_minimax: bool = True) -> float: """Fraction of models agreeing with majority (0-1).""" cats = [] for i, s in enumerate(signals): if s is None: continue if exclude_minimax and MODEL_NAMES[i] in EXCLUDED_FROM_MAJORITY: continue cats.append(s) if not cats: return 0.0 counts = Counter(cats) top_count = counts.most_common(1)[0][1] return top_count / len(cats) def format_signals(signals: list[str | None]) -> str: """Compact model signal display.""" parts = [] for name, cat in zip(MODEL_NAMES, signals): if cat is None: parts.append(f"{name}=??") else: parts.append(f"{name}={abbrev(cat)}") return ", ".join(parts) def wrap_text(text: str, width: int = 100) -> str: return "\n ".join(textwrap.wrap(text, width=width)) def print_paragraph_analysis( pid: str, v30_sigs: list[str | None], v35_sigs: list[str | None], header: str = "", ): """Print detailed analysis for a single paragraph.""" text = para_text.get(pid, "[TEXT NOT FOUND]") h_labels = human_labels.get(pid, []) h_maj = human_majority(pid) v30_maj = majority_vote(v30_sigs) v35_maj = majority_vote(v35_sigs) axes = pid_axes.get(pid, []) if header: print(f"\n{'─' * 110}") print(f" {header}") print(f"{'─' * 110}") else: print(f"\n{'─' * 110}") print(f" PID: {pid}") print(f" Axes: {', '.join(axes)}") print(f"\n TEXT:") print(f" {wrap_text(text)}") print(f"\n HUMAN VOTES:") for name, cat in h_labels: marker = " ✓" if cat == h_maj else "" print(f" {name:12s} → {abbrev(cat):5s}{marker}") print(f" Majority → {abbrev(h_maj) if h_maj else '??'}") print(f"\n v3.0 signals: {format_signals(v30_sigs)}") print(f" v3.0 majority (excl. MiniMax): {abbrev(v30_maj) if v30_maj else '??'}") print(f" v3.5 signals: {format_signals(v35_sigs)}") print(f" v3.5 majority (excl. MiniMax): {abbrev(v35_maj) if v35_maj else '??'}") # What changed changed_models = [] for i, (old, new) in enumerate(zip(v30_sigs, v35_sigs)): if old is not None and new is not None and old != new: changed_models.append(f"{MODEL_NAMES[i]}: {abbrev(old)}→{abbrev(new)}") if changed_models: print(f"\n CHANGES: {', '.join(changed_models)}") correct_v30 = v30_maj == h_maj if v30_maj and h_maj else None correct_v35 = v35_maj == h_maj if v35_maj and h_maj else None print( f" v3.0 {'CORRECT' if correct_v30 else 'WRONG'} | " f"v3.5 {'CORRECT' if correct_v35 else 'WRONG'}" ) # ══════════════════════════════════════════════════════════════════════════════ # SECTION 1: BG↔MR Regression Cases # ══════════════════════════════════════════════════════════════════════════════ print("\n" + "═" * 110) print(" SECTION 1: BG↔MR AXIS — REGRESSION CASES") print(" (v3.0 matched human majority, but v3.5 does NOT)") print("═" * 110) bg_mr_pids = [pid for pid, axes in pid_axes.items() if "BG_MR" in axes] print(f"\nTotal BG↔MR paragraphs: {len(bg_mr_pids)}") # Filter to those with human labels bg_mr_pids = [pid for pid in bg_mr_pids if human_majority(pid) is not None] print(f"With human labels: {len(bg_mr_pids)}") regressions_bg_mr = [] improvements_bg_mr = [] both_correct_bg_mr = [] both_wrong_bg_mr = [] for pid in bg_mr_pids: v30_sigs = get_signals(v30_signals, pid) v35_sigs = get_signals(v35_signals, pid) v30_maj = majority_vote(v30_sigs) v35_maj = majority_vote(v35_sigs) h_maj = human_majority(pid) if v30_maj is None or v35_maj is None or h_maj is None: continue v30_correct = abbrev(v30_maj) == abbrev(h_maj) v35_correct = abbrev(v35_maj) == abbrev(h_maj) if v30_correct and not v35_correct: regressions_bg_mr.append(pid) elif not v30_correct and v35_correct: improvements_bg_mr.append(pid) elif v30_correct and v35_correct: both_correct_bg_mr.append(pid) else: both_wrong_bg_mr.append(pid) print(f"\nBG↔MR Summary:") print(f" Both correct: {len(both_correct_bg_mr)}") print(f" Both wrong: {len(both_wrong_bg_mr)}") print(f" v3.0 correct → v3.5 WRONG (REGRESSIONS): {len(regressions_bg_mr)}") print(f" v3.0 wrong → v3.5 correct (IMPROVEMENTS): {len(improvements_bg_mr)}") print(f"\n{'━' * 110}") print(f" BG↔MR REGRESSIONS (showing all, up to 20)") print(f"{'━' * 110}") for i, pid in enumerate(regressions_bg_mr[:20]): v30_sigs = get_signals(v30_signals, pid) v35_sigs = get_signals(v35_signals, pid) print_paragraph_analysis(pid, v30_sigs, v35_sigs, f"REGRESSION #{i+1}") # BG↔MR improvements print(f"\n{'━' * 110}") print(f" BG↔MR IMPROVEMENTS (showing up to 5)") print(f"{'━' * 110}") for i, pid in enumerate(improvements_bg_mr[:5]): v30_sigs = get_signals(v30_signals, pid) v35_sigs = get_signals(v35_signals, pid) print_paragraph_analysis(pid, v30_sigs, v35_sigs, f"IMPROVEMENT #{i+1}") # ══════════════════════════════════════════════════════════════════════════════ # SECTION 2: MR↔RMP Non-Convergence Cases # ══════════════════════════════════════════════════════════════════════════════ print("\n\n" + "═" * 110) print(" SECTION 2: MR↔RMP AXIS — NON-CONVERGENCE AND REGRESSIONS") print("═" * 110) mr_rmp_pids = [pid for pid, axes in pid_axes.items() if "MR_RMP" in axes] print(f"\nTotal MR↔RMP paragraphs: {len(mr_rmp_pids)}") mr_rmp_pids = [pid for pid in mr_rmp_pids if human_majority(pid) is not None] print(f"With human labels: {len(mr_rmp_pids)}") # Find: less unanimous in v3.5 OR flipped away from human majority non_convergence_mr_rmp = [] regressions_mr_rmp = [] improvements_mr_rmp = [] for pid in mr_rmp_pids: v30_sigs = get_signals(v30_signals, pid) v35_sigs = get_signals(v35_signals, pid) v30_maj = majority_vote(v30_sigs) v35_maj = majority_vote(v35_sigs) h_maj = human_majority(pid) v30_unanimity = unanimity_score(v30_sigs) v35_unanimity = unanimity_score(v35_sigs) if v30_maj is None or v35_maj is None or h_maj is None: continue v30_correct = abbrev(v30_maj) == abbrev(h_maj) v35_correct = abbrev(v35_maj) == abbrev(h_maj) # Regression: was correct, now wrong if v30_correct and not v35_correct: regressions_mr_rmp.append((pid, v30_unanimity, v35_unanimity)) # Non-convergence: less unanimous OR flipped away if v35_unanimity < v30_unanimity or (v30_correct and not v35_correct): non_convergence_mr_rmp.append((pid, v30_unanimity, v35_unanimity)) if not v30_correct and v35_correct: improvements_mr_rmp.append((pid, v30_unanimity, v35_unanimity)) # Sort non-convergence by delta (worst first) non_convergence_mr_rmp.sort(key=lambda x: x[1] - x[2], reverse=True) print(f"\nMR↔RMP Summary:") print(f" Regressions (correct→wrong): {len(regressions_mr_rmp)}") print(f" Non-convergence (less unanimous or regressed): {len(non_convergence_mr_rmp)}") print(f" Improvements (wrong→correct): {len(improvements_mr_rmp)}") print(f"\n{'━' * 110}") print(f" MR↔RMP NON-CONVERGENCE / REGRESSION CASES (showing 10)") print(f"{'━' * 110}") shown = set() count = 0 for pid, v30_u, v35_u in non_convergence_mr_rmp: if count >= 10: break if pid in shown: continue shown.add(pid) v30_sigs = get_signals(v30_signals, pid) v35_sigs = get_signals(v35_signals, pid) v30_maj = majority_vote(v30_sigs) v35_maj = majority_vote(v35_sigs) h_maj = human_majority(pid) label = "REGRESSION" if (abbrev(v30_maj) == abbrev(h_maj) and abbrev(v35_maj) != abbrev(h_maj)) else "LESS UNANIMOUS" print_paragraph_analysis( pid, v30_sigs, v35_sigs, f"{label} #{count+1} (unanimity: v3.0={v30_u:.0%} → v3.5={v35_u:.0%})" ) count += 1 print(f"\n{'━' * 110}") print(f" MR↔RMP IMPROVEMENTS (showing up to 5)") print(f"{'━' * 110}") for i, (pid, v30_u, v35_u) in enumerate(improvements_mr_rmp[:5]): v30_sigs = get_signals(v30_signals, pid) v35_sigs = get_signals(v35_signals, pid) print_paragraph_analysis( pid, v30_sigs, v35_sigs, f"IMPROVEMENT #{i+1} (unanimity: v3.0={v30_u:.0%} → v3.5={v35_u:.0%})" ) # ══════════════════════════════════════════════════════════════════════════════ # SECTION 3: Error Pattern Analysis # ══════════════════════════════════════════════════════════════════════════════ print("\n\n" + "═" * 110) print(" SECTION 3: ERROR PATTERN ANALYSIS") print("═" * 110) # ── BG↔MR regression patterns ─────────────────────────────────────────────── print(f"\n{'━' * 110}") print(f" 3A: BG↔MR REGRESSION PATTERNS") print(f"{'━' * 110}") if regressions_bg_mr: # Analyze what the human majority is and what v3.5 switched to regression_directions = Counter() regression_model_flips = Counter() for pid in regressions_bg_mr: h_maj = human_majority(pid) v30_sigs = get_signals(v30_signals, pid) v35_sigs = get_signals(v35_signals, pid) v30_maj = majority_vote(v30_sigs) v35_maj = majority_vote(v35_sigs) direction = f"{abbrev(v30_maj)}→{abbrev(v35_maj)} (human={abbrev(h_maj)})" regression_directions[direction] += 1 # Which models flipped? for i, (old, new) in enumerate(zip(v30_sigs, v35_sigs)): if old and new and old != new: regression_model_flips[MODEL_NAMES[i]] += 1 print(f"\n Regression directions (v3.0→v3.5, human ground truth):") for direction, count in regression_directions.most_common(): print(f" {direction}: {count}") print(f"\n Models that flipped most on regressions:") for model, count in regression_model_flips.most_common(): print(f" {model}: {count} flips") # Text pattern analysis print(f"\n Common textual signals in regression paragraphs:") signal_words = { "board": 0, "committee": 0, "oversee": 0, "oversight": 0, "report": 0, "director": 0, "officer": 0, "CISO": 0, "governance": 0, "responsible": 0, "qualif": 0, "experience": 0, "manage": 0, "program": 0, "framework": 0, "process": 0, "audit": 0, } for pid in regressions_bg_mr: text = para_text.get(pid, "").lower() for word in signal_words: if word.lower() in text: signal_words[word] += 1 total_reg = len(regressions_bg_mr) for word, count in sorted(signal_words.items(), key=lambda x: -x[1]): if count > 0: print(f" '{word}': {count}/{total_reg} ({count/total_reg:.0%})") # Check if humans are split on these print(f"\n Human agreement on regressions:") unanimous_human = 0 split_human = 0 for pid in regressions_bg_mr: labels = human_labels.get(pid, []) cats = [c for _, c in labels] if len(set(cats)) == 1: unanimous_human += 1 else: split_human += 1 print(f" Unanimous human: {unanimous_human}") print(f" Split human (2-1): {split_human}") if split_human > 0: print(f"\n Split-human regression details:") for pid in regressions_bg_mr: labels = human_labels.get(pid, []) cats = [c for _, c in labels] if len(set(cats)) > 1: votes = ", ".join(f"{n}={abbrev(c)}" for n, c in labels) print(f" {pid[:12]}... → {votes}") else: print("\n No BG↔MR regressions found.") # ── MR↔RMP patterns ───────────────────────────────────────────────────────── print(f"\n{'━' * 110}") print(f" 3B: MR↔RMP NON-CONVERGENCE PATTERNS") print(f"{'━' * 110}") if non_convergence_mr_rmp: # Regression directions nc_directions = Counter() nc_model_flips = Counter() for pid, _, _ in non_convergence_mr_rmp: h_maj = human_majority(pid) v30_sigs = get_signals(v30_signals, pid) v35_sigs = get_signals(v35_signals, pid) v30_maj = majority_vote(v30_sigs) v35_maj = majority_vote(v35_sigs) direction = f"{abbrev(v30_maj)}→{abbrev(v35_maj)} (human={abbrev(h_maj)})" nc_directions[direction] += 1 for i, (old, new) in enumerate(zip(v30_sigs, v35_sigs)): if old and new and old != new: nc_model_flips[MODEL_NAMES[i]] += 1 print(f"\n Direction of non-convergent shifts:") for direction, count in nc_directions.most_common(): print(f" {direction}: {count}") print(f"\n Models that flipped most:") for model, count in nc_model_flips.most_common(): print(f" {model}: {count} flips") # Text pattern analysis — compare what helped vs what didn't print(f"\n Text signals in NON-CONVERGENT vs IMPROVED paragraphs:") keywords = ["CISO", "officer", "responsible", "oversee", "report", "program", "framework", "qualif", "experience", "certif", "manage", "assess", "monitor", "team", "director"] nc_pids_set = {pid for pid, _, _ in non_convergence_mr_rmp} imp_pids_set = {pid for pid, _, _ in improvements_mr_rmp} print(f"\n {'Keyword':<16} {'Non-conv':>10} {'Improved':>10}") print(f" {'─'*16} {'─'*10} {'─'*10}") for kw in keywords: nc_count = sum(1 for pid in nc_pids_set if kw.lower() in para_text.get(pid, "").lower()) imp_count = sum(1 for pid in imp_pids_set if kw.lower() in para_text.get(pid, "").lower()) nc_pct = f"{nc_count}/{len(nc_pids_set)}" if nc_pids_set else "0" imp_pct = f"{imp_count}/{len(imp_pids_set)}" if imp_pids_set else "0" print(f" {kw:<16} {nc_pct:>10} {imp_pct:>10}") # Person-removal test analysis print(f"\n Person-removal test applicability:") print(f" Checking if regression paragraphs have person as ONLY subject...") for pid, _, _ in regressions_mr_rmp: text = para_text.get(pid, "") has_person_subject = any( marker in text.lower() for marker in ["ciso", "chief information", "chief technology", "vice president", "director of", "officer"] ) has_process_subject = any( marker in text.lower() for marker in ["program", "framework", "process", "system", "controls", "policies", "procedures"] ) h_maj = human_majority(pid) v35_maj = majority_vote(get_signals(v35_signals, pid)) print( f" {pid[:12]}... person_subj={has_person_subject} " f"process_subj={has_process_subject} " f"human={abbrev(h_maj)} v3.5={abbrev(v35_maj)}" ) else: print("\n No MR↔RMP non-convergence cases found.") # ══════════════════════════════════════════════════════════════════════════════ # SECTION 4: Ruling Recommendations # ══════════════════════════════════════════════════════════════════════════════ print("\n\n" + "═" * 110) print(" SECTION 4: RULING RECOMMENDATIONS") print("═" * 110) print(""" Based on the error analysis above, here are the specific ruling observations: ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4A: BG↔MR Board-Line Test ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ CURRENT RULING (Rule 2): "When a paragraph spans layers (governance chain paragraphs): apply the dominant-subject test — which layer occupies the most sentence-subjects?" "Governance overview spanning board → committee → officer → program → Board Governance if the board/committee occupies more sentence-subjects; Management Role if the officer does; Risk Management Process if the program does" """) # Analyze the specific regressions to give targeted advice if regressions_bg_mr: # Count what direction the regressions went bg_to_mr = sum( 1 for pid in regressions_bg_mr if abbrev(majority_vote(get_signals(v35_signals, pid))) == "MR" and abbrev(human_majority(pid)) == "BG" ) mr_to_bg = sum( 1 for pid in regressions_bg_mr if abbrev(majority_vote(get_signals(v35_signals, pid))) == "BG" and abbrev(human_majority(pid)) == "MR" ) other_dir = len(regressions_bg_mr) - bg_to_mr - mr_to_bg print(f" EMPIRICAL FINDING:") print(f" Regressions that moved BG→MR (human says BG): {bg_to_mr}") print(f" Regressions that moved MR→BG (human says MR): {mr_to_bg}") print(f" Other directions: {other_dir}") if bg_to_mr > mr_to_bg: print(""" DIAGNOSIS: The dominant-subject test is OVER-CORRECTING toward MR. When a governance chain mentions a CISO or officer, models are counting that mention as a "sentence subject" even when the paragraph's primary purpose is describing the board/committee oversight structure. PROPOSED FIX — add a "purpose test" before the subject count: "Before counting sentence-subjects, ask: what is the paragraph's PRIMARY COMMUNICATIVE PURPOSE? If it is to describe the oversight/reporting structure (who oversees whom, what gets reported where), the paragraph is Board Governance even if individual officers are named as intermediaries. The dominant-subject count applies only when the paragraph's purpose is genuinely ambiguous between describing the oversight structure and describing the officer's role." Alternatively, add a carve-out: "A governance chain paragraph (board → committee → officer → program) defaults to Board Governance unless the officer section constitutes MORE THAN HALF the paragraph's content AND includes qualifications, credentials, or personal background." """) elif mr_to_bg > bg_to_mr: print(""" DIAGNOSIS: The dominant-subject test is OVER-CORRECTING toward BG. Paragraphs that are primarily about management roles are being pulled toward BG because they mention board oversight. PROPOSED FIX: "When a paragraph's primary content is about a management role (CISO, CIO, etc.) and mentions board oversight only as context for the reporting relationship, classify as Management Role. Board Governance requires the board/committee to be the PRIMARY ACTOR, not merely the recipient of reports." """) print(""" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4B: MR↔RMP Three-Step Chain ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ CURRENT RULING (Rule 2b): "Step 1 — Subject test: What is the paragraph's grammatical subject? Step 2 — Person-removal test: Could you delete all named roles, titles, qualifications, experience descriptions, and credentials from the paragraph and still have a coherent cybersecurity disclosure? Step 3 — Qualifications tiebreaker: Does the paragraph include experience (years), certifications (CISSP, CISM), education, team size, or career history for named individuals?" """) if regressions_mr_rmp: mr_to_rmp = sum( 1 for pid, _, _ in regressions_mr_rmp if abbrev(majority_vote(get_signals(v35_signals, pid))) == "RMP" and abbrev(human_majority(pid)) == "MR" ) rmp_to_mr = sum( 1 for pid, _, _ in regressions_mr_rmp if abbrev(majority_vote(get_signals(v35_signals, pid))) == "MR" and abbrev(human_majority(pid)) == "RMP" ) print(f" EMPIRICAL FINDING:") print(f" Regressions that moved MR→RMP (human says MR): {mr_to_rmp}") print(f" Regressions that moved RMP→MR (human says RMP): {rmp_to_mr}") if mr_to_rmp > rmp_to_mr: print(""" DIAGNOSIS: The person-removal test is TOO AGGRESSIVE at removing people. When a paragraph describes a CISO's monitoring activities, the person-removal test says "yes, the monitoring process stands alone," but the HUMANS recognize that the paragraph is fundamentally about the management role's responsibilities. PROPOSED FIX — tighten the person-removal test: "Step 2 — Person-removal test: Delete all named roles AND their associated ACTIVITIES. If the paragraph still describes a cybersecurity process or framework, it is Risk Management Process. If deleting the roles and their activities leaves nothing substantive, it is Management Role. Key distinction: 'The CISO monitors threat intelligence' — removing the CISO removes the monitoring activity, so this is Management Role. 'The company monitors threat intelligence under the direction of the CISO' — removing the CISO leaves the monitoring intact, so this is RMP." """) elif rmp_to_mr > mr_to_rmp: print(""" DIAGNOSIS: The three-step chain is UNDER-APPLYING the person-removal test. Models are stopping at Step 1 (subject test) when they see a role title, without proceeding to the person-removal test. PROPOSED FIX: "Step 1 should only produce a STRONG signal, not a decisive result. Always proceed to Step 2 unless the paragraph is ENTIRELY about a person's credentials with no process content whatsoever." """) if not regressions_mr_rmp: print(""" No MR↔RMP regressions found. The three-step chain may be working correctly, or the non-convergence is increasing uncertainty without changing majority votes. Focus on whether the increased model disagreement reflects genuine ambiguity or whether the step instructions need to be more prescriptive. """) # ── Final summary stats ────────────────────────────────────────────────────── print("\n" + "═" * 110) print(" FINAL SUMMARY") print("═" * 110) # Overall accuracy comparison total_with_human = 0 v30_correct_total = 0 v35_correct_total = 0 for pid in all_pids: h_maj = human_majority(pid) if h_maj is None: continue v30_sigs = get_signals(v30_signals, pid) v35_sigs = get_signals(v35_signals, pid) v30_maj = majority_vote(v30_sigs) v35_maj = majority_vote(v35_sigs) if v30_maj is None or v35_maj is None: continue total_with_human += 1 if abbrev(v30_maj) == abbrev(h_maj): v30_correct_total += 1 if abbrev(v35_maj) == abbrev(h_maj): v35_correct_total += 1 print(f"\n Overall accuracy on {total_with_human} confusion-axis paragraphs:") print(f" v3.0: {v30_correct_total}/{total_with_human} ({v30_correct_total/total_with_human:.1%})") print(f" v3.5: {v35_correct_total}/{total_with_human} ({v35_correct_total/total_with_human:.1%})") print(f" Delta: {v35_correct_total - v30_correct_total:+d}") # Per-axis breakdown for axis_name in ["BG_MR", "MR_RMP", "BG_RMP", "SI_NO"]: axis_pids = [pid for pid, axes in pid_axes.items() if axis_name in axes] v30_c = 0 v35_c = 0 n = 0 for pid in axis_pids: h_maj = human_majority(pid) if h_maj is None: continue v30_sigs = get_signals(v30_signals, pid) v35_sigs = get_signals(v35_signals, pid) v30_maj = majority_vote(v30_sigs) v35_maj = majority_vote(v35_sigs) if v30_maj is None or v35_maj is None: continue n += 1 if abbrev(v30_maj) == abbrev(h_maj): v30_c += 1 if abbrev(v35_maj) == abbrev(h_maj): v35_c += 1 if n > 0: print(f"\n {axis_name} ({n} paragraphs):") print(f" v3.0: {v30_c}/{n} ({v30_c/n:.1%})") print(f" v3.5: {v35_c}/{n} ({v35_c/n:.1%})") print(f" Delta: {v35_c - v30_c:+d}") print()