"""Examine specific paragraphs where v3.5 performed WORSE than v3.0 against human labels.

Focus on BG↔MR and MR↔RMP confusion axes.
"""
|
|
|
|
import json
import textwrap
from collections import Counter, defaultdict
from pathlib import Path

# ── Paths ──────────────────────────────────────────────────────────────────────
# Repository root: this script lives two levels below it.
ROOT = Path(__file__).resolve().parent.parent

V30_GOLDEN = ROOT / "data/annotations/golden/opus.jsonl"
V35_GOLDEN = ROOT / "data/annotations/golden-v35/opus.jsonl"

V30_BENCH = ROOT / "data/annotations/bench-holdout"
V35_BENCH = ROOT / "data/annotations/bench-holdout-v35"

HUMAN_LABELS = ROOT / "data/gold/human-labels-raw.jsonl"
HOLDOUT_META = ROOT / "data/gold/holdout-rerun-v35.jsonl"
PARAGRAPHS = ROOT / "data/gold/paragraphs-holdout.jsonl"

# Benchmark models: (annotation filename, display name), kept in one table so
# the two lists below can never drift out of alignment.
_MODELS: list[tuple[str, str]] = [
    ("opus.jsonl", "Opus"),
    ("gpt-5.4.jsonl", "GPT-5.4"),
    ("gemini-3.1-pro-preview.jsonl", "Gemini"),
    ("glm-5:exacto.jsonl", "GLM-5"),
    ("kimi-k2.5.jsonl", "Kimi"),
    ("mimo-v2-pro:exacto.jsonl", "Mimo"),
    ("minimax-m2.7:exacto.jsonl", "MiniMax"),
]
MODEL_FILES = [fname for fname, _ in _MODELS]
MODEL_NAMES = [name for _, name in _MODELS]

# Models to EXCLUDE from majority calculation
EXCLUDED_FROM_MAJORITY = {"MiniMax"}

# Short code → full category name, and the inverse mapping.
CAT_ABBREV = {
    "BG": "Board Governance",
    "MR": "Management Role",
    "RMP": "Risk Management Process",
    "SI": "Strategy Integration",
    "NO": "None/Other",
    "ID": "Incident Disclosure",
    "TPR": "Third-Party Risk",
}
ABBREV_CAT = {v: k for k, v in CAT_ABBREV.items()}
|
|
|
|
|
|
def abbrev(cat: str) -> str:
    """Return the short code for *cat*; unknown values pass through unchanged."""
    try:
        return ABBREV_CAT[cat]
    except KeyError:
        return cat
|
|
|
|
|
|
def load_jsonl(path: Path) -> list[dict]:
    """Parse a JSON-Lines file into a list of records, skipping blank lines."""
    records: list[dict] = []
    with open(path) as f:
        for line in f:
            if line.strip():
                records.append(json.loads(line))
    return records
|
|
|
|
|
|
def load_annotations(base_dir: Path, filename: str) -> dict[str, str]:
    """Load paragraphId → content_category mapping from *base_dir*/*filename*."""
    mapping: dict[str, str] = {}
    for rec in load_jsonl(base_dir / filename):
        mapping[rec["paragraphId"]] = rec["label"]["content_category"]
    return mapping
|
|
|
|
|
|
def load_golden(path: Path) -> dict[str, str]:
    """Load paragraphId → content_category mapping from a golden annotation file."""
    return {
        rec["paragraphId"]: rec["label"]["content_category"]
        for rec in load_jsonl(path)
    }
|
|
|
|
|
|
# ── Load all data ─────────────────────────────────────────────────────────────

print("Loading data...")

# Confusion-axis metadata: paragraphId → list of axis names (e.g. "BG_MR").
pid_axes: dict[str, list[str]] = {
    rec["paragraphId"]: rec["axes"] for rec in load_jsonl(HOLDOUT_META)
}
all_pids = set(pid_axes)

# Human labels restricted to holdout paragraphs:
# paragraphId → list of (annotator, category).
human_labels: dict[str, list[tuple[str, str]]] = defaultdict(list)
for rec in load_jsonl(HUMAN_LABELS):
    pid = rec["paragraphId"]
    if pid in all_pids:
        human_labels[pid].append((rec["annotatorName"], rec["contentCategory"]))
|
|
|
def human_majority(pid: str) -> str | None:
    """Return the modal human category for *pid*, or None when no labels exist.

    Ties break toward the first-seen category (Counter.most_common order).
    """
    votes = human_labels.get(pid)
    if not votes:
        return None
    tally = Counter(cat for _, cat in votes)
    return tally.most_common(1)[0][0]
|
|
|
|
|
|
# Paragraph text keyed by paragraph id.
para_records = load_jsonl(PARAGRAPHS)
para_text: dict[str, str] = {r["id"]: r["text"] for r in para_records}


def _load_signal_set(golden_path: Path, bench_dir: Path) -> list[dict[str, str]]:
    """Load per-model pid → category maps for one guideline version.

    Opus is read from its golden file (*golden_path*); every other model from
    *bench_dir*. Returns one dict per entry in MODEL_FILES, in order, so the
    result is index-aligned with MODEL_NAMES.
    """
    signals: list[dict[str, str]] = []
    for fname in MODEL_FILES:
        if fname == "opus.jsonl":
            signals.append(load_golden(golden_path))
        else:
            signals.append(load_annotations(bench_dir, fname))
    return signals


# model_idx → {pid: category}, one list per guideline version.
# (Previously two copy-pasted loops; factored into _load_signal_set.)
v30_signals: list[dict[str, str]] = _load_signal_set(V30_GOLDEN, V30_BENCH)
v35_signals: list[dict[str, str]] = _load_signal_set(V35_GOLDEN, V35_BENCH)
|
|
|
|
|
|
def get_signals(signals: list[dict[str, str]], pid: str) -> list[str | None]:
|
|
"""Get category from each model for a paragraph."""
|
|
return [s.get(pid) for s in signals]
|
|
|
|
|
|
def majority_vote(signals: list[str | None], exclude_minimax: bool = True) -> str | None:
|
|
"""Compute majority from 6 models (excluding minimax which is index 6)."""
|
|
cats = []
|
|
for i, s in enumerate(signals):
|
|
if s is None:
|
|
continue
|
|
if exclude_minimax and MODEL_NAMES[i] in EXCLUDED_FROM_MAJORITY:
|
|
continue
|
|
cats.append(s)
|
|
if not cats:
|
|
return None
|
|
counts = Counter(cats)
|
|
return counts.most_common(1)[0][0]
|
|
|
|
|
|
def unanimity_score(signals: list[str | None], exclude_minimax: bool = True) -> float:
|
|
"""Fraction of models agreeing with majority (0-1)."""
|
|
cats = []
|
|
for i, s in enumerate(signals):
|
|
if s is None:
|
|
continue
|
|
if exclude_minimax and MODEL_NAMES[i] in EXCLUDED_FROM_MAJORITY:
|
|
continue
|
|
cats.append(s)
|
|
if not cats:
|
|
return 0.0
|
|
counts = Counter(cats)
|
|
top_count = counts.most_common(1)[0][1]
|
|
return top_count / len(cats)
|
|
|
|
|
|
def format_signals(signals: list[str | None]) -> str:
    """Render per-model votes as 'Name=CAT' pairs; '??' marks a missing vote."""
    rendered = (
        f"{name}=??" if cat is None else f"{name}={abbrev(cat)}"
        for name, cat in zip(MODEL_NAMES, signals)
    )
    return ", ".join(rendered)
|
|
|
|
|
|
def wrap_text(text: str, width: int = 100) -> str:
|
|
return "\n ".join(textwrap.wrap(text, width=width))
|
|
|
|
|
|
def print_paragraph_analysis(
    pid: str,
    v30_sigs: list[str | None],
    v35_sigs: list[str | None],
    header: str = "",
):
    """Print detailed analysis for a single paragraph.

    Shows the paragraph text, individual human votes, per-model signals and
    majorities under both guideline versions, which models flipped between
    versions, and a verdict line comparing each version's majority against
    the human majority.
    """
    text = para_text.get(pid, "[TEXT NOT FOUND]")
    h_labels = human_labels.get(pid, [])
    h_maj = human_majority(pid)
    v30_maj = majority_vote(v30_sigs)
    v35_maj = majority_vote(v35_sigs)
    axes = pid_axes.get(pid, [])

    print(f"\n{'─' * 110}")
    if header:
        print(f"  {header}")
        print(f"{'─' * 110}")

    print(f"  PID: {pid}")
    print(f"  Axes: {', '.join(axes)}")
    print(f"\n  TEXT:")
    print(f"    {wrap_text(text)}")

    print(f"\n  HUMAN VOTES:")
    for name, cat in h_labels:
        marker = " ✓" if cat == h_maj else ""
        print(f"    {name:12s} → {abbrev(cat):5s}{marker}")
    print(f"    Majority     → {abbrev(h_maj) if h_maj else '??'}")

    print(f"\n  v3.0 signals: {format_signals(v30_sigs)}")
    print(f"  v3.0 majority (excl. MiniMax): {abbrev(v30_maj) if v30_maj else '??'}")

    print(f"  v3.5 signals: {format_signals(v35_sigs)}")
    print(f"  v3.5 majority (excl. MiniMax): {abbrev(v35_maj) if v35_maj else '??'}")

    # Per-model flips between versions.
    changed_models = [
        f"{MODEL_NAMES[i]}: {abbrev(old)}→{abbrev(new)}"
        for i, (old, new) in enumerate(zip(v30_sigs, v35_sigs))
        if old is not None and new is not None and old != new
    ]
    if changed_models:
        print(f"\n  CHANGES: {', '.join(changed_models)}")

    # Verdict line. Two fixes over the previous version:
    #  1. Compare abbreviated codes, as every other comparison in this script
    #     does — a raw string compare can disagree when one side uses full
    #     category names and the other uses codes.
    #  2. A missing majority used to fall through to "WRONG" (the ternary
    #     produced None, which is falsy); report it as "N/A" instead.
    def _verdict(model_maj: str | None) -> str:
        if model_maj is None or h_maj is None:
            return "N/A"
        return "CORRECT" if abbrev(model_maj) == abbrev(h_maj) else "WRONG"

    print(f"  v3.0 {_verdict(v30_maj)} | v3.5 {_verdict(v35_maj)}")
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# SECTION 1: BG↔MR Regression Cases
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "═" * 110)
print("  SECTION 1: BG↔MR AXIS — REGRESSION CASES")
print("  (v3.0 matched human majority, but v3.5 does NOT)")
print("═" * 110)

bg_mr_pids = [pid for pid, axes in pid_axes.items() if "BG_MR" in axes]
print(f"\nTotal BG↔MR paragraphs: {len(bg_mr_pids)}")

# Keep only paragraphs that have at least one human label.
bg_mr_pids = [pid for pid in bg_mr_pids if human_majority(pid) is not None]
print(f"With human labels: {len(bg_mr_pids)}")

# Bucket every labeled BG↔MR paragraph by how each guideline version fared
# against the human majority.
regressions_bg_mr = []
improvements_bg_mr = []
both_correct_bg_mr = []
both_wrong_bg_mr = []

for pid in bg_mr_pids:
    sigs_old = get_signals(v30_signals, pid)
    sigs_new = get_signals(v35_signals, pid)
    maj_old = majority_vote(sigs_old)
    maj_new = majority_vote(sigs_new)
    h_maj = human_majority(pid)

    if maj_old is None or maj_new is None or h_maj is None:
        continue

    was_correct = abbrev(maj_old) == abbrev(h_maj)
    now_correct = abbrev(maj_new) == abbrev(h_maj)

    if was_correct and not now_correct:
        regressions_bg_mr.append(pid)
    elif now_correct and not was_correct:
        improvements_bg_mr.append(pid)
    elif was_correct:  # both versions correct
        both_correct_bg_mr.append(pid)
    else:  # both versions wrong
        both_wrong_bg_mr.append(pid)

print(f"\nBG↔MR Summary:")
print(f"  Both correct: {len(both_correct_bg_mr)}")
print(f"  Both wrong: {len(both_wrong_bg_mr)}")
print(f"  v3.0 correct → v3.5 WRONG (REGRESSIONS): {len(regressions_bg_mr)}")
print(f"  v3.0 wrong → v3.5 correct (IMPROVEMENTS): {len(improvements_bg_mr)}")

print(f"\n{'━' * 110}")
print(f"  BG↔MR REGRESSIONS (showing all, up to 20)")
print(f"{'━' * 110}")

for i, pid in enumerate(regressions_bg_mr[:20]):
    print_paragraph_analysis(
        pid,
        get_signals(v30_signals, pid),
        get_signals(v35_signals, pid),
        f"REGRESSION #{i+1}",
    )

# BG↔MR improvements
print(f"\n{'━' * 110}")
print(f"  BG↔MR IMPROVEMENTS (showing up to 5)")
print(f"{'━' * 110}")

for i, pid in enumerate(improvements_bg_mr[:5]):
    print_paragraph_analysis(
        pid,
        get_signals(v30_signals, pid),
        get_signals(v35_signals, pid),
        f"IMPROVEMENT #{i+1}",
    )
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# SECTION 2: MR↔RMP Non-Convergence Cases
# ══════════════════════════════════════════════════════════════════════════════

print("\n\n" + "═" * 110)
print("  SECTION 2: MR↔RMP AXIS — NON-CONVERGENCE AND REGRESSIONS")
print("═" * 110)

mr_rmp_pids = [pid for pid, axes in pid_axes.items() if "MR_RMP" in axes]
print(f"\nTotal MR↔RMP paragraphs: {len(mr_rmp_pids)}")
mr_rmp_pids = [pid for pid in mr_rmp_pids if human_majority(pid) is not None]
print(f"With human labels: {len(mr_rmp_pids)}")

# Find: less unanimous in v3.5 OR flipped away from human majority.
non_convergence_mr_rmp = []
regressions_mr_rmp = []
improvements_mr_rmp = []

for pid in mr_rmp_pids:
    sigs_old = get_signals(v30_signals, pid)
    sigs_new = get_signals(v35_signals, pid)
    maj_old = majority_vote(sigs_old)
    maj_new = majority_vote(sigs_new)
    h_maj = human_majority(pid)
    unan_old = unanimity_score(sigs_old)
    unan_new = unanimity_score(sigs_new)

    if maj_old is None or maj_new is None or h_maj is None:
        continue

    was_correct = abbrev(maj_old) == abbrev(h_maj)
    now_correct = abbrev(maj_new) == abbrev(h_maj)
    entry = (pid, unan_old, unan_new)

    # Regression: was correct, now wrong.
    if was_correct and not now_correct:
        regressions_mr_rmp.append(entry)

    # Non-convergence: less unanimous OR flipped away.
    if unan_new < unan_old or (was_correct and not now_correct):
        non_convergence_mr_rmp.append(entry)

    if now_correct and not was_correct:
        improvements_mr_rmp.append(entry)

# Sort non-convergence by unanimity drop, worst first.
non_convergence_mr_rmp.sort(key=lambda e: e[1] - e[2], reverse=True)

print(f"\nMR↔RMP Summary:")
print(f"  Regressions (correct→wrong): {len(regressions_mr_rmp)}")
print(f"  Non-convergence (less unanimous or regressed): {len(non_convergence_mr_rmp)}")
print(f"  Improvements (wrong→correct): {len(improvements_mr_rmp)}")

print(f"\n{'━' * 110}")
print(f"  MR↔RMP NON-CONVERGENCE / REGRESSION CASES (showing 10)")
print(f"{'━' * 110}")

printed_pids = set()
printed = 0
for pid, v30_u, v35_u in non_convergence_mr_rmp:
    if printed >= 10:
        break
    if pid in printed_pids:
        continue
    printed_pids.add(pid)
    sigs_old = get_signals(v30_signals, pid)
    sigs_new = get_signals(v35_signals, pid)
    maj_old = majority_vote(sigs_old)
    maj_new = majority_vote(sigs_new)
    h_maj = human_majority(pid)
    if abbrev(maj_old) == abbrev(h_maj) and abbrev(maj_new) != abbrev(h_maj):
        label = "REGRESSION"
    else:
        label = "LESS UNANIMOUS"
    print_paragraph_analysis(
        pid, sigs_old, sigs_new,
        f"{label} #{printed+1} (unanimity: v3.0={v30_u:.0%} → v3.5={v35_u:.0%})"
    )
    printed += 1

print(f"\n{'━' * 110}")
print(f"  MR↔RMP IMPROVEMENTS (showing up to 5)")
print(f"{'━' * 110}")

for i, (pid, v30_u, v35_u) in enumerate(improvements_mr_rmp[:5]):
    print_paragraph_analysis(
        pid,
        get_signals(v30_signals, pid),
        get_signals(v35_signals, pid),
        f"IMPROVEMENT #{i+1} (unanimity: v3.0={v30_u:.0%} → v3.5={v35_u:.0%})"
    )
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# SECTION 3: Error Pattern Analysis
# ══════════════════════════════════════════════════════════════════════════════

print("\n\n" + "═" * 110)
print("  SECTION 3: ERROR PATTERN ANALYSIS")
print("═" * 110)

# ── BG↔MR regression patterns ───────────────────────────────────────────────
print(f"\n{'━' * 110}")
print(f"  3A: BG↔MR REGRESSION PATTERNS")
print(f"{'━' * 110}")

if regressions_bg_mr:
    # Direction of the majority flip, and which models did the flipping.
    regression_directions = Counter()
    regression_model_flips = Counter()

    for pid in regressions_bg_mr:
        h_maj = human_majority(pid)
        sigs_old = get_signals(v30_signals, pid)
        sigs_new = get_signals(v35_signals, pid)
        maj_old = majority_vote(sigs_old)
        maj_new = majority_vote(sigs_new)
        key = f"{abbrev(maj_old)}→{abbrev(maj_new)} (human={abbrev(h_maj)})"
        regression_directions[key] += 1

        # Which models flipped?
        for idx, (before, after) in enumerate(zip(sigs_old, sigs_new)):
            if before and after and before != after:
                regression_model_flips[MODEL_NAMES[idx]] += 1

    print(f"\n  Regression directions (v3.0→v3.5, human ground truth):")
    for direction, count in regression_directions.most_common():
        print(f"    {direction}: {count}")

    print(f"\n  Models that flipped most on regressions:")
    for model, count in regression_model_flips.most_common():
        print(f"    {model}: {count} flips")

    # Text pattern analysis: keyword incidence across regression paragraphs.
    print(f"\n  Common textual signals in regression paragraphs:")
    signal_words = dict.fromkeys(
        [
            "board", "committee", "oversee", "oversight",
            "report", "director", "officer", "CISO",
            "governance", "responsible", "qualif", "experience",
            "manage", "program", "framework", "process",
            "audit",
        ],
        0,
    )
    for pid in regressions_bg_mr:
        lowered = para_text.get(pid, "").lower()
        for word in signal_words:
            if word.lower() in lowered:
                signal_words[word] += 1

    total_reg = len(regressions_bg_mr)
    for word, count in sorted(signal_words.items(), key=lambda kv: -kv[1]):
        if count > 0:
            print(f"    '{word}': {count}/{total_reg} ({count/total_reg:.0%})")

    # Check if humans are split on these.
    print(f"\n  Human agreement on regressions:")
    unanimous_human = 0
    split_human = 0
    for pid in regressions_bg_mr:
        cats = [c for _, c in human_labels.get(pid, [])]
        if len(set(cats)) == 1:
            unanimous_human += 1
        else:
            split_human += 1
    print(f"    Unanimous human: {unanimous_human}")
    print(f"    Split human (2-1): {split_human}")

    if split_human > 0:
        print(f"\n  Split-human regression details:")
        for pid in regressions_bg_mr:
            pid_votes = human_labels.get(pid, [])
            if len({c for _, c in pid_votes}) > 1:
                votes = ", ".join(f"{n}={abbrev(c)}" for n, c in pid_votes)
                print(f"    {pid[:12]}... → {votes}")
else:
    print("\n  No BG↔MR regressions found.")
|
|
|
|
# ── MR↔RMP patterns ─────────────────────────────────────────────────────────
print(f"\n{'━' * 110}")
print(f"  3B: MR↔RMP NON-CONVERGENCE PATTERNS")
print(f"{'━' * 110}")

if non_convergence_mr_rmp:
    # Direction of the shift, and which models flipped.
    nc_directions = Counter()
    nc_model_flips = Counter()

    for pid, _, _ in non_convergence_mr_rmp:
        h_maj = human_majority(pid)
        sigs_old = get_signals(v30_signals, pid)
        sigs_new = get_signals(v35_signals, pid)
        maj_old = majority_vote(sigs_old)
        maj_new = majority_vote(sigs_new)
        key = f"{abbrev(maj_old)}→{abbrev(maj_new)} (human={abbrev(h_maj)})"
        nc_directions[key] += 1

        for idx, (before, after) in enumerate(zip(sigs_old, sigs_new)):
            if before and after and before != after:
                nc_model_flips[MODEL_NAMES[idx]] += 1

    print(f"\n  Direction of non-convergent shifts:")
    for direction, count in nc_directions.most_common():
        print(f"    {direction}: {count}")

    print(f"\n  Models that flipped most:")
    for model, count in nc_model_flips.most_common():
        print(f"    {model}: {count} flips")

    # Text pattern analysis — compare what helped vs what didn't.
    print(f"\n  Text signals in NON-CONVERGENT vs IMPROVED paragraphs:")

    keywords = ["CISO", "officer", "responsible", "oversee", "report",
                "program", "framework", "qualif", "experience", "certif",
                "manage", "assess", "monitor", "team", "director"]

    nc_pids_set = {pid for pid, _, _ in non_convergence_mr_rmp}
    imp_pids_set = {pid for pid, _, _ in improvements_mr_rmp}

    print(f"\n  {'Keyword':<16} {'Non-conv':>10} {'Improved':>10}")
    print(f"  {'─'*16} {'─'*10} {'─'*10}")
    for kw in keywords:
        needle = kw.lower()
        nc_count = sum(1 for pid in nc_pids_set if needle in para_text.get(pid, "").lower())
        imp_count = sum(1 for pid in imp_pids_set if needle in para_text.get(pid, "").lower())
        nc_pct = f"{nc_count}/{len(nc_pids_set)}" if nc_pids_set else "0"
        imp_pct = f"{imp_count}/{len(imp_pids_set)}" if imp_pids_set else "0"
        print(f"  {kw:<16} {nc_pct:>10} {imp_pct:>10}")

    # Person-removal test analysis.
    print(f"\n  Person-removal test applicability:")
    print(f"    Checking if regression paragraphs have person as ONLY subject...")
    person_markers = ["ciso", "chief information", "chief technology",
                      "vice president", "director of", "officer"]
    process_markers = ["program", "framework", "process", "system",
                       "controls", "policies", "procedures"]
    for pid, _, _ in regressions_mr_rmp:
        lowered = para_text.get(pid, "").lower()
        has_person_subject = any(m in lowered for m in person_markers)
        has_process_subject = any(m in lowered for m in process_markers)
        h_maj = human_majority(pid)
        v35_maj = majority_vote(get_signals(v35_signals, pid))
        print(
            f"    {pid[:12]}... person_subj={has_person_subject} "
            f"process_subj={has_process_subject} "
            f"human={abbrev(h_maj)} v3.5={abbrev(v35_maj)}"
        )
else:
    print("\n  No MR↔RMP non-convergence cases found.")
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# SECTION 4: Ruling Recommendations
# ══════════════════════════════════════════════════════════════════════════════

print("\n\n" + "═" * 110)
print("  SECTION 4: RULING RECOMMENDATIONS")
print("═" * 110)

print("""
Based on the error analysis above, here are the specific ruling observations:

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
4A: BG↔MR Board-Line Test
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

CURRENT RULING (Rule 2):
  "When a paragraph spans layers (governance chain paragraphs): apply the
  dominant-subject test — which layer occupies the most sentence-subjects?"

  "Governance overview spanning board → committee → officer → program →
  Board Governance if the board/committee occupies more sentence-subjects;
  Management Role if the officer does; Risk Management Process if the
  program does"
""")

# Analyze the specific regressions to give targeted advice.
if regressions_bg_mr:
    # Tally the flip directions among BG↔MR regressions.
    bg_to_mr = sum(
        1 for pid in regressions_bg_mr
        if abbrev(majority_vote(get_signals(v35_signals, pid))) == "MR"
        and abbrev(human_majority(pid)) == "BG"
    )
    mr_to_bg = sum(
        1 for pid in regressions_bg_mr
        if abbrev(majority_vote(get_signals(v35_signals, pid))) == "BG"
        and abbrev(human_majority(pid)) == "MR"
    )
    other_dir = len(regressions_bg_mr) - bg_to_mr - mr_to_bg

    print(f"  EMPIRICAL FINDING:")
    print(f"    Regressions that moved BG→MR (human says BG): {bg_to_mr}")
    print(f"    Regressions that moved MR→BG (human says MR): {mr_to_bg}")
    print(f"    Other directions: {other_dir}")

    if bg_to_mr > mr_to_bg:
        print("""
  DIAGNOSIS: The dominant-subject test is OVER-CORRECTING toward MR.
  When a governance chain mentions a CISO or officer, models are counting that
  mention as a "sentence subject" even when the paragraph's primary purpose is
  describing the board/committee oversight structure.

  PROPOSED FIX — add a "purpose test" before the subject count:
    "Before counting sentence-subjects, ask: what is the paragraph's PRIMARY
    COMMUNICATIVE PURPOSE? If it is to describe the oversight/reporting
    structure (who oversees whom, what gets reported where), the paragraph
    is Board Governance even if individual officers are named as intermediaries.
    The dominant-subject count applies only when the paragraph's purpose is
    genuinely ambiguous between describing the oversight structure and
    describing the officer's role."

  Alternatively, add a carve-out:
    "A governance chain paragraph (board → committee → officer → program)
    defaults to Board Governance unless the officer section constitutes
    MORE THAN HALF the paragraph's content AND includes qualifications,
    credentials, or personal background."
""")
    elif mr_to_bg > bg_to_mr:
        print("""
  DIAGNOSIS: The dominant-subject test is OVER-CORRECTING toward BG.
  Paragraphs that are primarily about management roles are being pulled
  toward BG because they mention board oversight.

  PROPOSED FIX:
    "When a paragraph's primary content is about a management role (CISO,
    CIO, etc.) and mentions board oversight only as context for the
    reporting relationship, classify as Management Role. Board Governance
    requires the board/committee to be the PRIMARY ACTOR, not merely
    the recipient of reports."
""")

print("""
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
4B: MR↔RMP Three-Step Chain
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

CURRENT RULING (Rule 2b):
  "Step 1 — Subject test: What is the paragraph's grammatical subject?
   Step 2 — Person-removal test: Could you delete all named roles, titles,
   qualifications, experience descriptions, and credentials from the
   paragraph and still have a coherent cybersecurity disclosure?
   Step 3 — Qualifications tiebreaker: Does the paragraph include experience
   (years), certifications (CISSP, CISM), education, team size, or career
   history for named individuals?"
""")

if regressions_mr_rmp:
    mr_to_rmp = sum(
        1 for pid, _, _ in regressions_mr_rmp
        if abbrev(majority_vote(get_signals(v35_signals, pid))) == "RMP"
        and abbrev(human_majority(pid)) == "MR"
    )
    rmp_to_mr = sum(
        1 for pid, _, _ in regressions_mr_rmp
        if abbrev(majority_vote(get_signals(v35_signals, pid))) == "MR"
        and abbrev(human_majority(pid)) == "RMP"
    )

    print(f"  EMPIRICAL FINDING:")
    print(f"    Regressions that moved MR→RMP (human says MR): {mr_to_rmp}")
    print(f"    Regressions that moved RMP→MR (human says RMP): {rmp_to_mr}")

    if mr_to_rmp > rmp_to_mr:
        print("""
  DIAGNOSIS: The person-removal test is TOO AGGRESSIVE at removing people.
  When a paragraph describes a CISO's monitoring activities, the person-removal
  test says "yes, the monitoring process stands alone," but the HUMANS recognize
  that the paragraph is fundamentally about the management role's responsibilities.

  PROPOSED FIX — tighten the person-removal test:
    "Step 2 — Person-removal test: Delete all named roles AND their associated
    ACTIVITIES. If the paragraph still describes a cybersecurity process or
    framework, it is Risk Management Process. If deleting the roles and their
    activities leaves nothing substantive, it is Management Role.
    Key distinction: 'The CISO monitors threat intelligence' — removing the
    CISO removes the monitoring activity, so this is Management Role.
    'The company monitors threat intelligence under the direction of the CISO'
    — removing the CISO leaves the monitoring intact, so this is RMP."
""")
    elif rmp_to_mr > mr_to_rmp:
        print("""
  DIAGNOSIS: The three-step chain is UNDER-APPLYING the person-removal test.
  Models are stopping at Step 1 (subject test) when they see a role title,
  without proceeding to the person-removal test.

  PROPOSED FIX:
    "Step 1 should only produce a STRONG signal, not a decisive result.
    Always proceed to Step 2 unless the paragraph is ENTIRELY about
    a person's credentials with no process content whatsoever."
""")

if not regressions_mr_rmp:
    print("""
  No MR↔RMP regressions found. The three-step chain may be working correctly,
  or the non-convergence is increasing uncertainty without changing majority votes.
  Focus on whether the increased model disagreement reflects genuine ambiguity
  or whether the step instructions need to be more prescriptive.
""")
|
|
|
|
# ── Final summary stats ──────────────────────────────────────────────────────
print("\n" + "═" * 110)
print("  FINAL SUMMARY")
print("═" * 110)


def _accuracy_counts(pids) -> tuple[int, int, int]:
    """Return (n, v30_correct, v35_correct) accuracy counts over *pids*.

    A paragraph is counted only when it has a human majority AND both
    guideline versions produce a model majority; correctness compares
    abbreviated category codes. Factored out of the previously duplicated
    overall/per-axis loops.
    """
    n = v30_ok = v35_ok = 0
    for pid in pids:
        h_maj = human_majority(pid)
        if h_maj is None:
            continue
        v30_maj = majority_vote(get_signals(v30_signals, pid))
        v35_maj = majority_vote(get_signals(v35_signals, pid))
        if v30_maj is None or v35_maj is None:
            continue
        n += 1
        if abbrev(v30_maj) == abbrev(h_maj):
            v30_ok += 1
        if abbrev(v35_maj) == abbrev(h_maj):
            v35_ok += 1
    return n, v30_ok, v35_ok


# Overall accuracy comparison.
total_with_human, v30_correct_total, v35_correct_total = _accuracy_counts(all_pids)

# Guard: the previous version divided by total_with_human unconditionally and
# raised ZeroDivisionError when no paragraph qualified.
if total_with_human:
    print(f"\n  Overall accuracy on {total_with_human} confusion-axis paragraphs:")
    print(f"    v3.0: {v30_correct_total}/{total_with_human} ({v30_correct_total/total_with_human:.1%})")
    print(f"    v3.5: {v35_correct_total}/{total_with_human} ({v35_correct_total/total_with_human:.1%})")
    print(f"    Delta: {v35_correct_total - v30_correct_total:+d}")
else:
    print("\n  No confusion-axis paragraphs with both human labels and model majorities.")

# Per-axis breakdown.
for axis_name in ["BG_MR", "MR_RMP", "BG_RMP", "SI_NO"]:
    axis_pids = [pid for pid, axes in pid_axes.items() if axis_name in axes]
    n, v30_c, v35_c = _accuracy_counts(axis_pids)
    if n > 0:
        print(f"\n  {axis_name} ({n} paragraphs):")
        print(f"    v3.0: {v30_c}/{n} ({v30_c/n:.1%})")
        print(f"    v3.5: {v35_c}/{n} ({v35_c/n:.1%})")
        print(f"    Delta: {v35_c - v30_c:+d}")

print()
|