SEC-cyBERT/scripts/audit-stage1-labels.py
2026-04-03 14:43:53 -04:00

621 lines
28 KiB
Python

"""
Audit Stage 1 annotations for systematic SI↔N/O miscoding.
Stage 1 used prompt v2.5 which lacked the rule "materiality disclaimers → SI."
This script quantifies how many N/O labels likely should have been SI, plus
other potential miscoding axes.
Run: uv run --with numpy scripts/audit-stage1-labels.py
"""
import json
import re
import sys
from collections import Counter, defaultdict
from pathlib import Path
# ── Paths ──────────────────────────────────────────────────────────────
# Repo root inferred from this script's location (scripts/ is one level down).
ROOT = Path(__file__).resolve().parent.parent
# Stage 1 model annotations (patched), one JSON object per line.
ANNOTATIONS = ROOT / "data" / "annotations" / "stage1.patched.jsonl"
# Paragraph texts: the patched file is preferred; fallback used if absent.
PARAGRAPHS = ROOT / "data" / "paragraphs" / "paragraphs-clean.patched.jsonl"
PARAGRAPHS_FALLBACK = ROOT / "data" / "paragraphs" / "paragraphs-clean.jsonl"
# Gold holdout paragraphs and raw human labels, used for cross-validation.
HOLDOUT = ROOT / "data" / "gold" / "paragraphs-holdout.jsonl"
HUMAN_LABELS = ROOT / "data" / "gold" / "human-labels-raw.jsonl"
# ── Materiality regex patterns ─────────────────────────────────────────
# Pattern 1: "material" near business/strategy language (within ~100 chars).
# Two symmetric alternatives: "material ... <biz phrase>" and
# "<biz phrase> ... material".  BUG FIX: the second arm previously read
# r"\baterial(?:ly)?", which can never match ("material" has a word char
# before the 'a', so \b always fails) — the biz-phrase-first ordering was
# silently never detected.  Corrected to r"\bmaterial(?:ly)?".
PAT_MATERIAL_NEAR_BIZ = re.compile(
    r"material(?:ly)?\b.{0,100}\b(?:business\s+strategy|results\s+of\s+operations|financial\s+condition|business|operations)"
    r"|"
    r"(?:business\s+strategy|results\s+of\s+operations|financial\s+condition)\b.{0,100}\bmaterial(?:ly)?",
    re.IGNORECASE,
)
# Pattern 2: specific materiality disclaimer phrases — the SEC-boilerplate
# language ("have not materially affected...", "could materially affect...")
# that prompt v2.5 routed to N/O but which should signal Strategy Integration.
PAT_MATERIALITY_DISCLAIMER = re.compile(
    r"have\s+not\s+materially\s+affected"          # past-tense "no impact" disclaimers
    r"|has\s+not\s+materially\s+affected"
    r"|could\s+materially\s+affect"                # forward-looking hedges
    r"|could\s+have\s+a\s+material\s+(?:adverse\s+)?(?:effect|impact)"
    r"|may\s+(?:materially|have\s+a\s+material)\s+(?:adverse\s+)?(?:effect|impact|affect)"
    r"|reasonably\s+likely\s+to\s+materially\s+affect"
    r"|not\s+reasonably\s+likely"                  # NOTE(review): broad — can match outside materiality contexts
    r"|materially\s+(?:adverse(?:ly)?|impact|affect)"
    r"|material\s+adverse\s+(?:effect|impact)"
    r"|no\s+material\s+(?:adverse\s+)?(?:effect|impact)"
    r"|did\s+not\s+(?:have\s+a\s+)?material(?:ly)?\s+(?:adverse\s+)?(?:effect|impact|affect)",
    re.IGNORECASE,
)
# Pattern 3: explicit SI-relevant phrases — direct mentions of business
# strategy / financials or of cyber risk being folded into broader processes.
PAT_SI_PHRASES = re.compile(
    r"business\s+strategy"
    r"|results\s+of\s+operations"
    r"|financial\s+condition"
    r"|integrated\s+(?:into|with)\s+(?:our\s+)?(?:overall|business)"
    r"|part\s+of\s+(?:our\s+)?(?:overall|broader)\s+(?:risk|enterprise|business)",
    re.IGNORECASE,
)
def has_materiality_language(text: str) -> bool:
    """Return True if *text* carries materiality-related language indicative of SI.

    Tries the cheap disclaimer/phrase patterns before the proximity pattern,
    short-circuiting on the first hit.
    """
    probes = (PAT_MATERIALITY_DISCLAIMER, PAT_SI_PHRASES, PAT_MATERIAL_NEAR_BIZ)
    return any(pattern.search(text) for pattern in probes)
# ── Insurance / budget / incident patterns ─────────────────────────────
# Insurance mentions — often Strategy Integration context rather than N/O.
PAT_INSURANCE = re.compile(r"\binsurance\b", re.IGNORECASE)
# Budget / investment language — likewise SI-leaning.
PAT_BUDGET = re.compile(r"\b(?:budget|investment(?:s)?)\b", re.IGNORECASE)
# First-person incident admissions ("we experienced / suffered / ...").
PAT_INCIDENT = re.compile(
    r"\bwe\s+(?:experienced|suffered|detected|identified|discovered|encountered|were\s+subject\s+to)\b",
    re.IGNORECASE,
)
# ── Cross-category confusion patterns ──────────────────────────────────
# Process/program vocabulary — prominent in Risk Management Process text.
PAT_PROGRAM_FRAMEWORK = re.compile(
    r"\b(?:program|framework|process(?:es)?|procedure(?:s)?)\b", re.IGNORECASE
)
# Specific executive titles — a signal for Management Role content.
# BUG FIX: the acronym alternative had no leading \b, so with IGNORECASE it
# matched substrings of longer words (e.g. the "ciso" ending "narciso",
# "cio" ending "socio"); the leading word boundary is now anchored to match
# the trailing one that was already present.
PAT_TITLE = re.compile(
    r"\b(?:Chief\s+(?:Information|Technology|Executive|Financial|Security|Operating|Risk)\s+(?:Officer|Security\s+Officer))"
    r"|\b(?:CISO|CIO|CTO|CFO|CEO|COO|CRO)\b"
    r"|\b(?:Vice\s+President|Director|Senior\s+Vice\s+President|EVP|SVP)\b",
    re.IGNORECASE,
)
# Generic management/leadership vocabulary (no specific title required);
# used to spot Board Governance paragraphs that are really about management.
PAT_MANAGEMENT_OFFICERS = re.compile(
    r"\b(?:management|officer(?:s)?|executive(?:s)?|leader(?:s)?(?:hip)?)\b",
    re.IGNORECASE,
)
def separator(title: str) -> None:
    """Print a section banner: blank line, 80-char rule, the title, another rule."""
    rule = "=" * 80
    print(f"\n{rule}\n {title}\n{rule}")
def print_example(idx: int, pid: str, text: str, extra: str = "") -> None:
    """Print one numbered example: the paragraph id, an optional extra line,
    then the text truncated to 500 characters (with a "..." marker)."""
    print(f"\n [{idx}] paragraphId: {pid}")
    if extra:
        print(f" {extra}")
    snippet = text if len(text) <= 500 else text[:500] + "..."
    print(f" TEXT: {snippet}")
# ── Load data ──────────────────────────────────────────────────────────
def load_annotations() -> dict[str, list[dict]]:
    """Load Stage 1 model annotations, grouped per paragraph.

    Returns {paragraphId: [{"category": ..., "model": ...}, ...]}.
    """
    grouped: dict[str, list[dict]] = defaultdict(list)
    with ANNOTATIONS.open() as fh:
        for raw in fh:
            record = json.loads(raw)
            grouped[record["paragraphId"]].append(
                {
                    "category": record["label"]["content_category"],
                    "model": record["provenance"]["modelId"],
                }
            )
    return dict(grouped)
def load_paragraphs() -> dict[str, str]:
    """Load paragraph texts as {paragraphId: text}, preferring the patched file."""
    source = PARAGRAPHS if PARAGRAPHS.exists() else PARAGRAPHS_FALLBACK
    with source.open() as fh:
        return {rec["id"]: rec["text"] for rec in map(json.loads, fh)}
def load_holdout() -> dict[str, dict]:
    """Load gold holdout paragraphs as {paragraphId: full record}.

    Each record carries text plus stage1Category / stage1Method fields.
    """
    with HOLDOUT.open() as fh:
        return {rec["id"]: rec for rec in map(json.loads, fh)}
def load_human_labels() -> dict[str, list[dict]]:
    """Load raw human labels, grouped per paragraph.

    Returns {paragraphId: [{"annotator": ..., "category": ..., "specificity": ...}, ...]}.
    """
    grouped: dict[str, list[dict]] = defaultdict(list)
    with HUMAN_LABELS.open() as fh:
        for raw in fh:
            rec = json.loads(raw)
            entry = {
                "annotator": rec["annotatorName"],
                "category": rec["contentCategory"],
                "specificity": rec["specificityLevel"],
            }
            grouped[rec["paragraphId"]].append(entry)
    return dict(grouped)
def main() -> None:
    """Run the full Stage 1 label audit and print a sectioned report to stdout.

    Sections: (1) unanimous-N/O vs materiality language, (2) majority-N/O,
    (3) clean-SI contrast, (4) other N/O miscoding axes, (5) overall scale,
    (6/6b) holdout + human-label cross-checks, (7) other confusion axes,
    and a final SUMMARY roll-up.
    """
    print("Loading data...")
    annotations = load_annotations()
    texts = load_paragraphs()
    holdout = load_holdout()
    human_labels = load_human_labels()
    print(f" Annotations: {sum(len(v) for v in annotations.values())} across {len(annotations)} paragraphs")
    print(f" Paragraph texts loaded: {len(texts)}")
    print(f" Holdout paragraphs: {len(holdout)}")
    print(f" Human-labeled paragraphs: {len(human_labels)}")
    # ── Classify each paragraph by voting ──────────────────────────────
    unanimous_no: list[str] = []
    majority_no: list[str] = []  # 2/3 N/O
    unanimous_si: list[str] = []
    unanimous_mr: list[str] = []
    unanimous_rmp: list[str] = []
    unanimous_bg: list[str] = []
    all_unanimous: dict[str, str] = {}  # pid -> category for unanimous
    for pid, anns in annotations.items():
        cats = [a["category"] for a in anns]
        cat_counts = Counter(cats)
        if len(cats) != 3:
            continue  # skip incomplete
        if cat_counts.get("None/Other", 0) == 3:
            unanimous_no.append(pid)
            all_unanimous[pid] = "None/Other"
        elif cat_counts.get("None/Other", 0) == 2:
            majority_no.append(pid)
        elif cat_counts.get("Strategy Integration", 0) == 3:
            unanimous_si.append(pid)
            all_unanimous[pid] = "Strategy Integration"
        elif cat_counts.get("Management Role", 0) == 3:
            unanimous_mr.append(pid)
            all_unanimous[pid] = "Management Role"
        elif cat_counts.get("Risk Management Process", 0) == 3:
            unanimous_rmp.append(pid)
            all_unanimous[pid] = "Risk Management Process"
        elif cat_counts.get("Board Governance", 0) == 3:
            unanimous_bg.append(pid)
            all_unanimous[pid] = "Board Governance"
        # Track all unanimous
        # (catch-all for unanimous categories not singled out above;
        #  redundant for the five categories already recorded, but harmless)
        if len(cat_counts) == 1:
            all_unanimous[pid] = cats[0]
    print(f"\n Unanimous N/O: {len(unanimous_no)}")
    print(f" Majority N/O (2/3): {len(majority_no)}")
    print(f" Unanimous SI: {len(unanimous_si)}")
    print(f" Unanimous MR: {len(unanimous_mr)}")
    print(f" Unanimous RMP: {len(unanimous_rmp)}")
    print(f" Unanimous BG: {len(unanimous_bg)}")
    print(f" Total unanimous (any): {len(all_unanimous)}")
    # ════════════════════════════════════════════════════════════════════
    # 1. Unanimous N/O with materiality language
    # ════════════════════════════════════════════════════════════════════
    separator("1. UNANIMOUS N/O WITH MATERIALITY LANGUAGE")
    no_with_mat: list[tuple[str, str]] = []
    no_without_text = 0
    for pid in unanimous_no:
        text = texts.get(pid)
        if text is None:
            no_without_text += 1
            continue
        if has_materiality_language(text):
            no_with_mat.append((pid, text))
    print(f"\n Total unanimous N/O: {len(unanimous_no)}")
    print(f" Missing text: {no_without_text}")
    print(f" With materiality language: {len(no_with_mat)}")
    print(f" Percentage of unanimous N/O: {len(no_with_mat) / max(1, len(unanimous_no)) * 100:.1f}%")
    print(f"\n --- 10 representative examples ---")
    # Pick a diverse sample: take every Nth
    step = max(1, len(no_with_mat) // 10)
    shown = 0
    for i in range(0, len(no_with_mat), step):
        if shown >= 10:
            break
        pid, text = no_with_mat[i]
        print_example(shown + 1, pid, text)
        shown += 1
    # ════════════════════════════════════════════════════════════════════
    # 2. Majority N/O with materiality language
    # ════════════════════════════════════════════════════════════════════
    separator("2. MAJORITY N/O (2/3) WITH MATERIALITY LANGUAGE")
    maj_no_with_mat: list[tuple[str, str, str, str]] = []  # pid, text, dissenting_model, dissenting_cat
    for pid in majority_no:
        text = texts.get(pid)
        if text is None:
            continue
        if has_materiality_language(text):
            anns = annotations[pid]
            for a in anns:
                if a["category"] != "None/Other":
                    maj_no_with_mat.append((pid, text, a["model"], a["category"]))
                    break  # record only the first dissenting annotation
    print(f"\n Total majority N/O (2/3): {len(majority_no)}")
    print(f" With materiality language: {len(maj_no_with_mat)}")
    print(f" Percentage: {len(maj_no_with_mat) / max(1, len(majority_no)) * 100:.1f}%")
    # Count dissenting categories
    dissent_cats = Counter(x[3] for x in maj_no_with_mat)
    print(f"\n Dissenting model voted:")
    for cat, cnt in dissent_cats.most_common():
        print(f" {cat}: {cnt}")
    # Count dissenting models
    dissent_models = Counter(x[2] for x in maj_no_with_mat)
    print(f"\n Which models dissented:")
    for model, cnt in dissent_models.most_common():
        print(f" {model}: {cnt}")
    print(f"\n --- 5 examples ---")
    step = max(1, len(maj_no_with_mat) // 5)
    shown = 0
    for i in range(0, len(maj_no_with_mat), step):
        if shown >= 5:
            break
        pid, text, model, cat = maj_no_with_mat[i]
        print_example(shown + 1, pid, text, f"Dissent: {model}{cat}")
        shown += 1
    # ════════════════════════════════════════════════════════════════════
    # 3. Unanimous SI examples (contrast)
    # ════════════════════════════════════════════════════════════════════
    separator("3. UNANIMOUS SI — WHAT CLEAN SI LOOKS LIKE")
    si_examples: list[tuple[str, str]] = []
    for pid in unanimous_si:
        text = texts.get(pid)
        if text:
            si_examples.append((pid, text))
        if len(si_examples) >= 20:
            break  # 20 is plenty; only 5 are printed below
    print(f"\n Total unanimous SI: {len(unanimous_si)}")
    print(f"\n --- 5 examples ---")
    for i, (pid, text) in enumerate(si_examples[:5]):
        print_example(i + 1, pid, text)
    # Analyze SI language patterns
    si_has_materiality = sum(1 for pid in unanimous_si if pid in texts and has_materiality_language(texts[pid]))
    si_has_insurance = sum(1 for pid in unanimous_si if pid in texts and PAT_INSURANCE.search(texts[pid]))
    si_has_budget = sum(1 for pid in unanimous_si if pid in texts and PAT_BUDGET.search(texts[pid]))
    print(f"\n SI language patterns:")
    print(f" With materiality language: {si_has_materiality} / {len(unanimous_si)} ({si_has_materiality / max(1, len(unanimous_si)) * 100:.1f}%)")
    print(f" Mention insurance: {si_has_insurance} / {len(unanimous_si)}")
    print(f" Mention budget/investment: {si_has_budget} / {len(unanimous_si)}")
    # ════════════════════════════════════════════════════════════════════
    # 4. N/O with other potential miscoding
    # ════════════════════════════════════════════════════════════════════
    separator("4. N/O PARAGRAPHS WITH OTHER POTENTIAL MISCODING")
    no_insurance: list[tuple[str, str]] = []
    no_budget: list[tuple[str, str]] = []
    no_incident: list[tuple[str, str]] = []
    for pid in unanimous_no:
        text = texts.get(pid)
        if text is None:
            continue
        if PAT_INSURANCE.search(text):
            no_insurance.append((pid, text))
        if PAT_BUDGET.search(text):
            no_budget.append((pid, text))
        if PAT_INCIDENT.search(text):
            no_incident.append((pid, text))
    print(f"\n Unanimous N/O mentioning insurance: {len(no_insurance)}")
    print(f" Unanimous N/O mentioning budget/investment: {len(no_budget)}")
    print(f" Unanimous N/O mentioning incidents ('we experienced...'): {len(no_incident)}")
    # Show examples for each
    print(f"\n --- Insurance examples (up to 3) ---")
    for i, (pid, text) in enumerate(no_insurance[:3]):
        print_example(i + 1, pid, text)
    print(f"\n --- Budget/investment examples (up to 3) ---")
    for i, (pid, text) in enumerate(no_budget[:3]):
        print_example(i + 1, pid, text)
    print(f"\n --- Incident examples (up to 3) ---")
    for i, (pid, text) in enumerate(no_incident[:3]):
        print_example(i + 1, pid, text)
    # ════════════════════════════════════════════════════════════════════
    # 5. Scale the problem
    # ════════════════════════════════════════════════════════════════════
    separator("5. SCALE THE PROBLEM")
    # Deduplicate: some paragraphs may hit multiple patterns
    no_any_miscoded: set[str] = set()
    for pid, _ in no_with_mat:
        no_any_miscoded.add(pid)
    for pid, _ in no_insurance:
        no_any_miscoded.add(pid)
    for pid, _ in no_budget:
        no_any_miscoded.add(pid)
    no_incident_pids = set(pid for pid, _ in no_incident)
    # Materiality-only (not already insurance/budget)
    # NOTE(review): mat_only / ins_only / bud_only are computed but never
    # used below — dead code, or a breakdown print was lost.
    mat_only = set(pid for pid, _ in no_with_mat)
    ins_only = set(pid for pid, _ in no_insurance) - mat_only
    bud_only = set(pid for pid, _ in no_budget) - mat_only - ins_only
    total_unanimous = len(all_unanimous)
    # NOTE(review): len(annotations) counts all annotated paragraphs, even
    # ones skipped above for not having exactly 3 annotations.
    total_annotations = len(annotations)
    print(f"\n Total paragraphs with 3 annotations: {total_annotations}")
    print(f" Total unanimous (any category): {total_unanimous}")
    print(f" Total unanimous N/O: {len(unanimous_no)}")
    print()
    print(f" Potentially miscoded unanimous N/O:")
    print(f" Materiality language (likely SI): {len(no_with_mat)}")
    print(f" Insurance (likely SI): {len(no_insurance)}")
    print(f" Budget/investment (likely SI): {len(no_budget)}")
    print(f" Incident language (likely SI or ID): {len(no_incident)}")
    print(f" Any of above (deduplicated): {len(no_any_miscoded)}")
    print(f" Incident (separate concern): {len(no_incident_pids)}")
    print()
    # Overlap analysis
    mat_set = set(pid for pid, _ in no_with_mat)
    ins_set = set(pid for pid, _ in no_insurance)
    bud_set = set(pid for pid, _ in no_budget)
    print(f" Overlap analysis:")
    print(f" Materiality ∩ Insurance: {len(mat_set & ins_set)}")
    print(f" Materiality ∩ Budget: {len(mat_set & bud_set)}")
    print(f" Insurance ∩ Budget: {len(ins_set & bud_set)}")
    print()
    pct_no_affected = len(no_any_miscoded) / max(1, len(unanimous_no)) * 100
    pct_total_affected = len(no_any_miscoded) / max(1, total_unanimous) * 100
    pct_all_affected = len(no_any_miscoded) / max(1, total_annotations) * 100
    print(f" Impact estimates:")
    print(f" % of unanimous N/O potentially miscoded: {pct_no_affected:.1f}%")
    print(f" % of all unanimous labels affected: {pct_total_affected:.1f}%")
    print(f" % of all paragraphs affected: {pct_all_affected:.1f}%")
    # Also check majority N/O
    maj_no_any: set[str] = set()
    for pid in majority_no:
        text = texts.get(pid)
        if text is None:
            continue
        if has_materiality_language(text) or PAT_INSURANCE.search(text) or PAT_BUDGET.search(text):
            maj_no_any.add(pid)
    print(f"\n Majority N/O (2/3) potentially miscoded: {len(maj_no_any)} / {len(majority_no)}")
    print(f" Combined (unanimous + majority) potentially miscoded N/O: {len(no_any_miscoded) + len(maj_no_any)}")
    # ════════════════════════════════════════════════════════════════════
    # 6. Cross-check with holdout / human labels
    # ════════════════════════════════════════════════════════════════════
    separator("6. HOLDOUT CROSS-CHECK WITH HUMAN LABELS")
    # Find holdout paragraphs that Stage 1 unanimously called N/O but contain materiality language
    holdout_no_mat: list[tuple[str, str]] = []
    holdout_no_mat_with_human: list[tuple[str, str, list[dict]]] = []
    for pid, para in holdout.items():
        if para.get("stage1Category") == "None/Other" and para.get("stage1Method") == "unanimous":
            text = para["text"]
            if has_materiality_language(text):
                holdout_no_mat.append((pid, text))
                if pid in human_labels:
                    holdout_no_mat_with_human.append((pid, text, human_labels[pid]))
    print(f"\n Holdout paragraphs with stage1 unanimous N/O: "
          f"{sum(1 for p in holdout.values() if p.get('stage1Category') == 'None/Other' and p.get('stage1Method') == 'unanimous')}")
    print(f" Of those, with materiality language: {len(holdout_no_mat)}")
    print(f" Of those, with human labels: {len(holdout_no_mat_with_human)}")
    # What did humans call these?
    if holdout_no_mat_with_human:
        human_cats_for_flagged = Counter()
        for pid, text, hlabels in holdout_no_mat_with_human:
            for hl in hlabels:
                human_cats_for_flagged[hl["category"]] += 1
        print(f"\n Human labels for flagged paragraphs (Stage1=unanimous N/O, has materiality language):")
        total_human = sum(human_cats_for_flagged.values())
        for cat, cnt in human_cats_for_flagged.most_common():
            print(f" {cat}: {cnt} ({cnt / total_human * 100:.1f}%)")
        print(f"\n --- Examples where humans disagreed with Stage 1 N/O ---")
        shown = 0
        for pid, text, hlabels in holdout_no_mat_with_human:
            non_no = [hl for hl in hlabels if hl["category"] != "None/Other"]
            if non_no:
                human_str = ", ".join(f"{hl['annotator']}={hl['category']}" for hl in hlabels)
                print_example(shown + 1, pid, text, f"Human labels: {human_str}")
                shown += 1
                if shown >= 5:
                    break
        # Also show ones where humans agreed it IS N/O
        print(f"\n --- Examples where humans also said N/O (materiality language is ambiguous) ---")
        shown = 0
        for pid, text, hlabels in holdout_no_mat_with_human:
            all_no = all(hl["category"] == "None/Other" for hl in hlabels)
            if all_no and len(hlabels) >= 2:
                print_example(shown + 1, pid, text, "All humans agreed: N/O")
                shown += 1
                if shown >= 3:
                    break
    else:
        print("\n No human labels available for flagged holdout paragraphs.")
    # Broader holdout analysis: all cases where Stage 1 said N/O but humans said something else
    separator("6b. HOLDOUT: ALL Stage1=N/O vs HUMAN DISAGREEMENTS")
    holdout_no_all = [pid for pid, p in holdout.items()
                      if p.get("stage1Category") == "None/Other"]
    stage1_no_human_disagree: list[tuple[str, str, list[dict]]] = []
    for pid in holdout_no_all:
        if pid in human_labels:
            hlabels = human_labels[pid]
            non_no = [hl for hl in hlabels if hl["category"] != "None/Other"]
            if non_no:
                stage1_no_human_disagree.append((pid, holdout[pid]["text"], hlabels))
    print(f"\n All holdout paragraphs with Stage1=N/O (any method): {len(holdout_no_all)}")
    print(f" Of those with human labels that disagree: {len(stage1_no_human_disagree)}")
    if stage1_no_human_disagree:
        # What did humans call them?
        human_override = Counter()
        for pid, text, hlabels in stage1_no_human_disagree:
            for hl in hlabels:
                if hl["category"] != "None/Other":
                    human_override[hl["category"]] += 1
        print(f"\n Humans' non-N/O labels for Stage1=N/O paragraphs:")
        for cat, cnt in human_override.most_common():
            print(f" {cat}: {cnt}")
    # ════════════════════════════════════════════════════════════════════
    # 7. Other confusion axes
    # ════════════════════════════════════════════════════════════════════
    separator("7. OTHER CONFUSION AXES IN STAGE 1")
    # 7a. Unanimous MR with program/framework/process language (potential RMP)
    mr_with_process: list[tuple[str, str, list[str]]] = []
    for pid in unanimous_mr:
        text = texts.get(pid)
        if text is None:
            continue
        matches = PAT_PROGRAM_FRAMEWORK.findall(text)
        if len(matches) >= 2:  # Multiple mentions = likely process-focused
            mr_with_process.append((pid, text, matches))
    print(f"\n 7a. Unanimous MR with prominent program/framework/process language")
    print(f" (>=2 mentions — potentially should be RMP)")
    print(f" Count: {len(mr_with_process)} / {len(unanimous_mr)} ({len(mr_with_process) / max(1, len(unanimous_mr)) * 100:.1f}%)")
    print(f"\n --- 3 examples ---")
    for i, (pid, text, matches) in enumerate(mr_with_process[:3]):
        print_example(i + 1, pid, text, f"Pattern matches: {matches[:6]}")
    # 7b. Unanimous RMP with specific titles (potential MR)
    rmp_with_titles: list[tuple[str, str, list[str]]] = []
    for pid in unanimous_rmp:
        text = texts.get(pid)
        if text is None:
            continue
        titles = PAT_TITLE.findall(text)
        if titles:
            rmp_with_titles.append((pid, text, titles))
    print(f"\n 7b. Unanimous RMP mentioning specific people/titles")
    print(f" (potentially should be MR)")
    print(f" Count: {len(rmp_with_titles)} / {len(unanimous_rmp)} ({len(rmp_with_titles) / max(1, len(unanimous_rmp)) * 100:.1f}%)")
    print(f"\n --- 3 examples ---")
    for i, (pid, text, titles) in enumerate(rmp_with_titles[:3]):
        print_example(i + 1, pid, text, f"Titles found: {titles[:5]}")
    # 7c. Unanimous BG primarily about management officers
    bg_about_mgmt: list[tuple[str, str, list[str]]] = []
    for pid in unanimous_bg:
        text = texts.get(pid)
        if text is None:
            continue
        has_titles = PAT_TITLE.findall(text)
        has_mgmt = PAT_MANAGEMENT_OFFICERS.findall(text)
        # If it has management language but no board language
        # NOTE(review): compiled inside the loop — hoisting to module level
        # would avoid the per-iteration re.compile call.
        board_pattern = re.compile(r"\b(?:board|director(?:s)?|committee|audit)\b", re.IGNORECASE)
        has_board = board_pattern.findall(text)
        if (has_titles or has_mgmt) and not has_board:
            bg_about_mgmt.append((pid, text, has_titles + has_mgmt))
    print(f"\n 7c. Unanimous BG primarily about management (no board/committee language)")
    print(f" Count: {len(bg_about_mgmt)} / {len(unanimous_bg)} ({len(bg_about_mgmt) / max(1, len(unanimous_bg)) * 100:.1f}%)")
    if bg_about_mgmt:
        print(f"\n --- 3 examples ---")
        for i, (pid, text, matches) in enumerate(bg_about_mgmt[:3]):
            print_example(i + 1, pid, text, f"Matches: {matches[:5]}")
    # ════════════════════════════════════════════════════════════════════
    # SUMMARY
    # ════════════════════════════════════════════════════════════════════
    separator("SUMMARY")
    # Final roll-up of the numbers computed above, as one report string.
    print(f"""
DATASET OVERVIEW
Total paragraphs annotated (3 models each): {total_annotations:,}
Total unanimous labels: {total_unanimous:,}
Unanimous N/O: {len(unanimous_no):,}
Majority N/O (2/3): {len(majority_no):,}
PRIMARY CONCERN: N/O → SI MISCODING
Unanimous N/O with materiality language: {len(no_with_mat):,} ({len(no_with_mat) / max(1, len(unanimous_no)) * 100:.1f}% of unanimous N/O)
Majority N/O with materiality language: {len(maj_no_with_mat):,} ({len(maj_no_with_mat) / max(1, len(majority_no)) * 100:.1f}% of majority N/O)
Unanimous N/O with insurance: {len(no_insurance):,}
Unanimous N/O with budget/investment: {len(no_budget):,}
Unanimous N/O with incident language: {len(no_incident):,}
Total potentially miscoded (deduplicated): {len(no_any_miscoded):,}
IMPACT ON TRAINING SET
% of unanimous N/O affected: {pct_no_affected:.1f}%
% of all unanimous labels affected: {pct_total_affected:.1f}%
% of all paragraphs affected: {pct_all_affected:.1f}%
OTHER CONFUSION AXES
MR ↔ RMP confusion (MR with process language): {len(mr_with_process):,} / {len(unanimous_mr):,}
RMP ↔ MR confusion (RMP with titles): {len(rmp_with_titles):,} / {len(unanimous_rmp):,}
BG about management (no board language): {len(bg_about_mgmt):,} / {len(unanimous_bg):,}
HOLDOUT VALIDATION
Stage1=unanimous N/O with materiality language: {len(holdout_no_mat):,}
Of those with human labels: {len(holdout_no_mat_with_human):,}
""")
    # NOTE(review): this recomputes the same Counter built in section 6 —
    # duplicate work, kept so the SUMMARY block is self-contained.
    if holdout_no_mat_with_human:
        human_cats_for_flagged = Counter()
        for pid, text, hlabels in holdout_no_mat_with_human:
            for hl in hlabels:
                human_cats_for_flagged[hl["category"]] += 1
        print(" HUMAN VALIDATION (flagged holdout paragraphs):")
        total_h = sum(human_cats_for_flagged.values())
        for cat, cnt in human_cats_for_flagged.most_common():
            print(f" {cat}: {cnt} ({cnt / total_h * 100:.1f}%)")
# Script entry point.
if __name__ == "__main__":
    main()