"""
Audit Stage 1 annotations for systematic SI↔N/O miscoding.

Stage 1 used prompt v2.5 which lacked the rule "materiality disclaimers → SI."
This script quantifies how many N/O labels likely should have been SI, plus
other potential miscoding axes.

Run: uv run --with numpy scripts/audit-stage1-labels.py
"""
|
|
|
|
import json
|
|
import re
|
|
import sys
|
|
from collections import Counter, defaultdict
|
|
from pathlib import Path
|
|
|
|
# ── Paths ──────────────────────────────────────────────────────────────
# All inputs are resolved relative to the repository root (one directory
# above the scripts/ folder containing this file).
ROOT = Path(__file__).resolve().parent.parent
# Stage 1 model annotations (patched JSONL).
ANNOTATIONS = ROOT / "data" / "annotations" / "stage1.patched.jsonl"
# Paragraph texts: prefer the patched file; fall back to the unpatched one
# (see load_paragraphs()).
PARAGRAPHS = ROOT / "data" / "paragraphs" / "paragraphs-clean.patched.jsonl"
PARAGRAPHS_FALLBACK = ROOT / "data" / "paragraphs" / "paragraphs-clean.jsonl"
# Gold holdout set and raw human labels used for cross-validation (section 6).
HOLDOUT = ROOT / "data" / "gold" / "paragraphs-holdout.jsonl"
HUMAN_LABELS = ROOT / "data" / "gold" / "human-labels-raw.jsonl"
# ── Materiality regex patterns ─────────────────────────────────────────
# Pattern 1: "material" near business/strategy language (within ~15 words).
# Two branches: "material(ly)" followed within 100 chars by a business term,
# or a business term followed within 100 chars by "material(ly)".
PAT_MATERIAL_NEAR_BIZ = re.compile(
    r"material(?:ly)?\b.{0,100}\b(?:business\s+strategy|results\s+of\s+operations|financial\s+condition|business|operations)"
    r"|"
    # Fixed: this branch previously read r"\baterial(?:ly)?" (missing the
    # leading "m"), so the "<business term> ... material" order never matched.
    r"(?:business\s+strategy|results\s+of\s+operations|financial\s+condition)\b.{0,100}\bmaterial(?:ly)?",
    re.IGNORECASE,
)
|
|
|
|
# Pattern 2: specific materiality disclaimer phrases.
# An alternation of stock 10-K phrasings ("have not materially affected ...",
# "could have a material adverse effect ...", etc.). Later branches partly
# subsume earlier ones; since callers only use .search(), any single branch
# hitting anywhere in the text is enough.
PAT_MATERIALITY_DISCLAIMER = re.compile(
    r"have\s+not\s+materially\s+affected"
    r"|has\s+not\s+materially\s+affected"
    r"|could\s+materially\s+affect"
    r"|could\s+have\s+a\s+material\s+(?:adverse\s+)?(?:effect|impact)"
    r"|may\s+(?:materially|have\s+a\s+material)\s+(?:adverse\s+)?(?:effect|impact|affect)"
    r"|reasonably\s+likely\s+to\s+materially\s+affect"
    r"|not\s+reasonably\s+likely"
    r"|materially\s+(?:adverse(?:ly)?|impact|affect)"
    r"|material\s+adverse\s+(?:effect|impact)"
    r"|no\s+material\s+(?:adverse\s+)?(?:effect|impact)"
    r"|did\s+not\s+(?:have\s+a\s+)?material(?:ly)?\s+(?:adverse\s+)?(?:effect|impact|affect)",
    re.IGNORECASE,
)
|
|
|
|
# Pattern 3: explicit SI-relevant phrases.
# Direct Strategy Integration vocabulary: references to business strategy /
# results of operations / financial condition, or language about security
# being integrated into broader business or enterprise risk processes.
PAT_SI_PHRASES = re.compile(
    r"business\s+strategy"
    r"|results\s+of\s+operations"
    r"|financial\s+condition"
    r"|integrated\s+(?:into|with)\s+(?:our\s+)?(?:overall|business)"
    r"|part\s+of\s+(?:our\s+)?(?:overall|broader)\s+(?:risk|enterprise|business)",
    re.IGNORECASE,
)
|
|
|
|
|
|
def has_materiality_language(text: str) -> bool:
    """Return True if *text* contains materiality-related language indicative of SI.

    Checks the three materiality regexes in turn; any single hit is enough.
    """
    checks = (PAT_MATERIALITY_DISCLAIMER, PAT_SI_PHRASES, PAT_MATERIAL_NEAR_BIZ)
    return any(pattern.search(text) for pattern in checks)
|
|
|
|
|
|
# ── Insurance / budget / incident patterns ─────────────────────────────
# Any mention of "insurance" as a standalone word.
PAT_INSURANCE = re.compile(r"\binsurance\b", re.IGNORECASE)
# Budget / investment(s) language.
PAT_BUDGET = re.compile(r"\b(?:budget|investment(?:s)?)\b", re.IGNORECASE)
# First-person incident disclosures: "we experienced/suffered/detected/..."
PAT_INCIDENT = re.compile(
    r"\bwe\s+(?:experienced|suffered|detected|identified|discovered|encountered|were\s+subject\s+to)\b",
    re.IGNORECASE,
)
|
|
|
|
# ── Cross-category confusion patterns ──────────────────────────────────
# Process-oriented vocabulary; repeated hits in a Management Role paragraph
# suggest it may really be Risk Management Process (see section 7a in main()).
PAT_PROGRAM_FRAMEWORK = re.compile(
    r"\b(?:program|framework|process(?:es)?|procedure(?:s)?)\b", re.IGNORECASE
)
|
|
# Executive titles and C-suite acronyms; used to flag RMP paragraphs that name
# specific people/roles (potential MR) — see sections 7b/7c in main().
PAT_TITLE = re.compile(
    r"\b(?:Chief\s+(?:Information|Technology|Executive|Financial|Security|Operating|Risk)\s+(?:Officer|Security\s+Officer))"
    # Fixed: the acronym branch lacked a leading \b, so under re.IGNORECASE it
    # matched mid-word substrings (e.g. the "cio" in "socio-economic").
    r"|\b(?:CISO|CIO|CTO|CFO|CEO|COO|CRO)\b"
    r"|\b(?:Vice\s+President|Director|Senior\s+Vice\s+President|EVP|SVP)\b",
    re.IGNORECASE,
)
|
|
# Generic management vocabulary (management / officers / executives / leaders
# / leadership); combined with PAT_TITLE in section 7c to find Board
# Governance paragraphs that are really about management.
PAT_MANAGEMENT_OFFICERS = re.compile(
    r"\b(?:management|officer(?:s)?|executive(?:s)?|leader(?:s)?(?:hip)?)\b",
    re.IGNORECASE,
)
|
|
|
|
|
|
def separator(title: str) -> None:
    """Print an 80-column '=' banner around *title* to mark a report section."""
    bar = "=" * 80
    print(f"\n{bar}\n {title}\n{bar}")
|
|
|
|
|
|
def print_example(idx: int, pid: str, text: str, extra: str = "") -> None:
    """Print one numbered example paragraph.

    Shows the paragraph id, an optional extra annotation line, and the text
    truncated to 500 characters (with a "..." marker) for readability.
    """
    snippet = text if len(text) <= 500 else text[:500] + "..."
    print(f"\n [{idx}] paragraphId: {pid}")
    if extra:
        print(f" {extra}")
    print(f" TEXT: {snippet}")
|
|
|
|
|
|
# ── Load data ──────────────────────────────────────────────────────────
|
|
def load_annotations() -> dict[str, list[dict]]:
    """Load Stage 1 annotations, grouped by paragraph.

    Returns {paragraphId: [{"category": ..., "model": ...}, ...]} built from
    the ANNOTATIONS JSONL file (one annotation record per line).
    """
    grouped: dict[str, list[dict]] = defaultdict(list)
    with open(ANNOTATIONS) as fh:
        for raw in fh:
            record = json.loads(raw)
            grouped[record["paragraphId"]].append(
                {
                    "category": record["label"]["content_category"],
                    "model": record["provenance"]["modelId"],
                }
            )
    return dict(grouped)
|
|
|
|
|
|
def load_paragraphs() -> dict[str, str]:
    """Load paragraph texts as {paragraphId: text}.

    Prefers the patched paragraphs file; falls back to the unpatched one if
    the patched file does not exist.
    """
    source = PARAGRAPHS if PARAGRAPHS.exists() else PARAGRAPHS_FALLBACK
    with open(source) as fh:
        return {rec["id"]: rec["text"] for rec in map(json.loads, fh)}
|
|
|
|
|
|
def load_holdout() -> dict[str, dict]:
    """Load the gold holdout set as {paragraphId: record}.

    Each record is the full JSONL object (text, stage1Category, stage1Method,
    and any other fields present in the file).
    """
    with open(HOLDOUT) as fh:
        return {rec["id"]: rec for rec in map(json.loads, fh)}
|
|
|
|
|
|
def load_human_labels() -> dict[str, list[dict]]:
    """Load raw human labels, grouped by paragraph.

    Returns {paragraphId: [{"annotator", "category", "specificity"}, ...]}
    built from the HUMAN_LABELS JSONL file (one label record per line).
    """
    collected: dict[str, list[dict]] = defaultdict(list)
    with open(HUMAN_LABELS) as fh:
        for raw in fh:
            rec = json.loads(raw)
            collected[rec["paragraphId"]].append(
                {
                    "annotator": rec["annotatorName"],
                    "category": rec["contentCategory"],
                    "specificity": rec["specificityLevel"],
                }
            )
    return dict(collected)
|
|
|
|
|
|
def main() -> None:
    """Run the full Stage 1 label audit and print a sectioned report to stdout.

    Sections:
      1/2. Unanimous and majority N/O paragraphs containing materiality
           language — the suspected N/O → SI miscoding from prompt v2.5.
      3.   Clean unanimous SI examples, for contrast.
      4.   Other N/O miscoding signals (insurance / budget / incident).
      5.   Aggregate impact estimates on the training set.
      6.   Cross-check against the human-labeled holdout set.
      7.   Other cross-category confusion axes (MR↔RMP, RMP↔MR, BG↔mgmt).
    """
    print("Loading data...")
    annotations = load_annotations()
    texts = load_paragraphs()
    holdout = load_holdout()
    human_labels = load_human_labels()

    print(f" Annotations: {sum(len(v) for v in annotations.values())} across {len(annotations)} paragraphs")
    print(f" Paragraph texts loaded: {len(texts)}")
    print(f" Holdout paragraphs: {len(holdout)}")
    print(f" Human-labeled paragraphs: {len(human_labels)}")

    # ── Classify each paragraph by voting ──────────────────────────────
    # Bucket each fully-annotated paragraph (exactly 3 model votes) by the
    # vote outcome on content_category.
    unanimous_no: list[str] = []
    majority_no: list[str] = []  # 2/3 N/O
    unanimous_si: list[str] = []
    unanimous_mr: list[str] = []
    unanimous_rmp: list[str] = []
    unanimous_bg: list[str] = []
    all_unanimous: dict[str, str] = {}  # pid -> category for unanimous

    for pid, anns in annotations.items():
        cats = [a["category"] for a in anns]
        cat_counts = Counter(cats)

        if len(cats) != 3:
            continue  # skip incomplete

        if cat_counts.get("None/Other", 0) == 3:
            unanimous_no.append(pid)
            all_unanimous[pid] = "None/Other"
        elif cat_counts.get("None/Other", 0) == 2:
            majority_no.append(pid)
        elif cat_counts.get("Strategy Integration", 0) == 3:
            unanimous_si.append(pid)
            all_unanimous[pid] = "Strategy Integration"
        elif cat_counts.get("Management Role", 0) == 3:
            unanimous_mr.append(pid)
            all_unanimous[pid] = "Management Role"
        elif cat_counts.get("Risk Management Process", 0) == 3:
            unanimous_rmp.append(pid)
            all_unanimous[pid] = "Risk Management Process"
        elif cat_counts.get("Board Governance", 0) == 3:
            unanimous_bg.append(pid)
            all_unanimous[pid] = "Board Governance"

        # Track all unanimous (also catches unanimous categories not
        # explicitly listed above).
        if len(cat_counts) == 1:
            all_unanimous[pid] = cats[0]

    print(f"\n Unanimous N/O: {len(unanimous_no)}")
    print(f" Majority N/O (2/3): {len(majority_no)}")
    print(f" Unanimous SI: {len(unanimous_si)}")
    print(f" Unanimous MR: {len(unanimous_mr)}")
    print(f" Unanimous RMP: {len(unanimous_rmp)}")
    print(f" Unanimous BG: {len(unanimous_bg)}")
    print(f" Total unanimous (any): {len(all_unanimous)}")

    # ════════════════════════════════════════════════════════════════════
    # 1. Unanimous N/O with materiality language
    # ════════════════════════════════════════════════════════════════════
    separator("1. UNANIMOUS N/O WITH MATERIALITY LANGUAGE")

    no_with_mat: list[tuple[str, str]] = []
    no_without_text = 0  # paragraphs with annotations but no loaded text
    for pid in unanimous_no:
        text = texts.get(pid)
        if text is None:
            no_without_text += 1
            continue
        if has_materiality_language(text):
            no_with_mat.append((pid, text))

    print(f"\n Total unanimous N/O: {len(unanimous_no)}")
    print(f" Missing text: {no_without_text}")
    print(f" With materiality language: {len(no_with_mat)}")
    print(f" Percentage of unanimous N/O: {len(no_with_mat) / max(1, len(unanimous_no)) * 100:.1f}%")

    print(f"\n --- 10 representative examples ---")
    # Pick a diverse sample: take every Nth
    step = max(1, len(no_with_mat) // 10)
    shown = 0
    for i in range(0, len(no_with_mat), step):
        if shown >= 10:
            break
        pid, text = no_with_mat[i]
        print_example(shown + 1, pid, text)
        shown += 1

    # ════════════════════════════════════════════════════════════════════
    # 2. Majority N/O with materiality language
    # ════════════════════════════════════════════════════════════════════
    separator("2. MAJORITY N/O (2/3) WITH MATERIALITY LANGUAGE")

    maj_no_with_mat: list[tuple[str, str, str, str]] = []  # pid, text, dissenting_model, dissenting_cat
    for pid in majority_no:
        text = texts.get(pid)
        if text is None:
            continue
        if has_materiality_language(text):
            anns = annotations[pid]
            # Record the single dissenting (non-N/O) vote.
            for a in anns:
                if a["category"] != "None/Other":
                    maj_no_with_mat.append((pid, text, a["model"], a["category"]))
                    break

    print(f"\n Total majority N/O (2/3): {len(majority_no)}")
    print(f" With materiality language: {len(maj_no_with_mat)}")
    print(f" Percentage: {len(maj_no_with_mat) / max(1, len(majority_no)) * 100:.1f}%")

    # Count dissenting categories
    dissent_cats = Counter(x[3] for x in maj_no_with_mat)
    print(f"\n Dissenting model voted:")
    for cat, cnt in dissent_cats.most_common():
        print(f" {cat}: {cnt}")

    # Count dissenting models
    dissent_models = Counter(x[2] for x in maj_no_with_mat)
    print(f"\n Which models dissented:")
    for model, cnt in dissent_models.most_common():
        print(f" {model}: {cnt}")

    print(f"\n --- 5 examples ---")
    step = max(1, len(maj_no_with_mat) // 5)
    shown = 0
    for i in range(0, len(maj_no_with_mat), step):
        if shown >= 5:
            break
        pid, text, model, cat = maj_no_with_mat[i]
        print_example(shown + 1, pid, text, f"Dissent: {model} → {cat}")
        shown += 1

    # ════════════════════════════════════════════════════════════════════
    # 3. Unanimous SI examples (contrast)
    # ════════════════════════════════════════════════════════════════════
    separator("3. UNANIMOUS SI — WHAT CLEAN SI LOOKS LIKE")

    si_examples: list[tuple[str, str]] = []
    for pid in unanimous_si:
        text = texts.get(pid)
        if text:
            si_examples.append((pid, text))
        if len(si_examples) >= 20:
            break

    print(f"\n Total unanimous SI: {len(unanimous_si)}")
    print(f"\n --- 5 examples ---")
    for i, (pid, text) in enumerate(si_examples[:5]):
        print_example(i + 1, pid, text)

    # Analyze SI language patterns
    si_has_materiality = sum(1 for pid in unanimous_si if pid in texts and has_materiality_language(texts[pid]))
    si_has_insurance = sum(1 for pid in unanimous_si if pid in texts and PAT_INSURANCE.search(texts[pid]))
    si_has_budget = sum(1 for pid in unanimous_si if pid in texts and PAT_BUDGET.search(texts[pid]))
    print(f"\n SI language patterns:")
    print(f" With materiality language: {si_has_materiality} / {len(unanimous_si)} ({si_has_materiality / max(1, len(unanimous_si)) * 100:.1f}%)")
    print(f" Mention insurance: {si_has_insurance} / {len(unanimous_si)}")
    print(f" Mention budget/investment: {si_has_budget} / {len(unanimous_si)}")

    # ════════════════════════════════════════════════════════════════════
    # 4. N/O with other potential miscoding
    # ════════════════════════════════════════════════════════════════════
    separator("4. N/O PARAGRAPHS WITH OTHER POTENTIAL MISCODING")

    no_insurance: list[tuple[str, str]] = []
    no_budget: list[tuple[str, str]] = []
    no_incident: list[tuple[str, str]] = []

    for pid in unanimous_no:
        text = texts.get(pid)
        if text is None:
            continue
        if PAT_INSURANCE.search(text):
            no_insurance.append((pid, text))
        if PAT_BUDGET.search(text):
            no_budget.append((pid, text))
        if PAT_INCIDENT.search(text):
            no_incident.append((pid, text))

    print(f"\n Unanimous N/O mentioning insurance: {len(no_insurance)}")
    print(f" Unanimous N/O mentioning budget/investment: {len(no_budget)}")
    print(f" Unanimous N/O mentioning incidents ('we experienced...'): {len(no_incident)}")

    # Show examples for each
    print(f"\n --- Insurance examples (up to 3) ---")
    for i, (pid, text) in enumerate(no_insurance[:3]):
        print_example(i + 1, pid, text)

    print(f"\n --- Budget/investment examples (up to 3) ---")
    for i, (pid, text) in enumerate(no_budget[:3]):
        print_example(i + 1, pid, text)

    print(f"\n --- Incident examples (up to 3) ---")
    for i, (pid, text) in enumerate(no_incident[:3]):
        print_example(i + 1, pid, text)

    # ════════════════════════════════════════════════════════════════════
    # 5. Scale the problem
    # ════════════════════════════════════════════════════════════════════
    separator("5. SCALE THE PROBLEM")

    # Deduplicate: some paragraphs may hit multiple patterns
    no_any_miscoded = set()
    for pid, _ in no_with_mat:
        no_any_miscoded.add(pid)
    for pid, _ in no_insurance:
        no_any_miscoded.add(pid)
    for pid, _ in no_budget:
        no_any_miscoded.add(pid)
    no_incident_pids = set(pid for pid, _ in no_incident)

    # Materiality-only (not already insurance/budget)
    # NOTE(review): ins_only / bud_only are computed but never printed below —
    # either report them or drop these two lines.
    mat_only = set(pid for pid, _ in no_with_mat)
    ins_only = set(pid for pid, _ in no_insurance) - mat_only
    bud_only = set(pid for pid, _ in no_budget) - mat_only - ins_only

    total_unanimous = len(all_unanimous)
    total_annotations = len(annotations)

    print(f"\n Total paragraphs with 3 annotations: {total_annotations}")
    print(f" Total unanimous (any category): {total_unanimous}")
    print(f" Total unanimous N/O: {len(unanimous_no)}")
    print()
    print(f" Potentially miscoded unanimous N/O:")
    print(f" Materiality language (likely SI): {len(no_with_mat)}")
    print(f" Insurance (likely SI): {len(no_insurance)}")
    print(f" Budget/investment (likely SI): {len(no_budget)}")
    print(f" Incident language (likely SI or ID): {len(no_incident)}")
    print(f" Any of above (deduplicated): {len(no_any_miscoded)}")
    print(f" Incident (separate concern): {len(no_incident_pids)}")
    print()

    # Overlap analysis
    mat_set = set(pid for pid, _ in no_with_mat)
    ins_set = set(pid for pid, _ in no_insurance)
    bud_set = set(pid for pid, _ in no_budget)
    print(f" Overlap analysis:")
    print(f" Materiality ∩ Insurance: {len(mat_set & ins_set)}")
    print(f" Materiality ∩ Budget: {len(mat_set & bud_set)}")
    print(f" Insurance ∩ Budget: {len(ins_set & bud_set)}")
    print()

    pct_no_affected = len(no_any_miscoded) / max(1, len(unanimous_no)) * 100
    pct_total_affected = len(no_any_miscoded) / max(1, total_unanimous) * 100
    pct_all_affected = len(no_any_miscoded) / max(1, total_annotations) * 100

    print(f" Impact estimates:")
    print(f" % of unanimous N/O potentially miscoded: {pct_no_affected:.1f}%")
    print(f" % of all unanimous labels affected: {pct_total_affected:.1f}%")
    print(f" % of all paragraphs affected: {pct_all_affected:.1f}%")

    # Also check majority N/O
    maj_no_any = set()
    for pid in majority_no:
        text = texts.get(pid)
        if text is None:
            continue
        if has_materiality_language(text) or PAT_INSURANCE.search(text) or PAT_BUDGET.search(text):
            maj_no_any.add(pid)

    print(f"\n Majority N/O (2/3) potentially miscoded: {len(maj_no_any)} / {len(majority_no)}")
    print(f" Combined (unanimous + majority) potentially miscoded N/O: {len(no_any_miscoded) + len(maj_no_any)}")

    # ════════════════════════════════════════════════════════════════════
    # 6. Cross-check with holdout / human labels
    # ════════════════════════════════════════════════════════════════════
    separator("6. HOLDOUT CROSS-CHECK WITH HUMAN LABELS")

    # Find holdout paragraphs that Stage 1 unanimously called N/O but contain materiality language
    holdout_no_mat: list[tuple[str, str]] = []
    holdout_no_mat_with_human: list[tuple[str, str, list[dict]]] = []

    for pid, para in holdout.items():
        if para.get("stage1Category") == "None/Other" and para.get("stage1Method") == "unanimous":
            text = para["text"]
            if has_materiality_language(text):
                holdout_no_mat.append((pid, text))
                if pid in human_labels:
                    holdout_no_mat_with_human.append((pid, text, human_labels[pid]))

    print(f"\n Holdout paragraphs with stage1 unanimous N/O: "
          f"{sum(1 for p in holdout.values() if p.get('stage1Category') == 'None/Other' and p.get('stage1Method') == 'unanimous')}")
    print(f" Of those, with materiality language: {len(holdout_no_mat)}")
    print(f" Of those, with human labels: {len(holdout_no_mat_with_human)}")

    # What did humans call these?
    if holdout_no_mat_with_human:
        human_cats_for_flagged = Counter()
        for pid, text, hlabels in holdout_no_mat_with_human:
            for hl in hlabels:
                human_cats_for_flagged[hl["category"]] += 1

        print(f"\n Human labels for flagged paragraphs (Stage1=unanimous N/O, has materiality language):")
        total_human = sum(human_cats_for_flagged.values())
        for cat, cnt in human_cats_for_flagged.most_common():
            print(f" {cat}: {cnt} ({cnt / total_human * 100:.1f}%)")

        print(f"\n --- Examples where humans disagreed with Stage 1 N/O ---")
        shown = 0
        for pid, text, hlabels in holdout_no_mat_with_human:
            non_no = [hl for hl in hlabels if hl["category"] != "None/Other"]
            if non_no:
                human_str = ", ".join(f"{hl['annotator']}={hl['category']}" for hl in hlabels)
                print_example(shown + 1, pid, text, f"Human labels: {human_str}")
                shown += 1
                if shown >= 5:
                    break

        # Also show ones where humans agreed it IS N/O
        print(f"\n --- Examples where humans also said N/O (materiality language is ambiguous) ---")
        shown = 0
        for pid, text, hlabels in holdout_no_mat_with_human:
            all_no = all(hl["category"] == "None/Other" for hl in hlabels)
            if all_no and len(hlabels) >= 2:
                print_example(shown + 1, pid, text, "All humans agreed: N/O")
                shown += 1
                if shown >= 3:
                    break
    else:
        print("\n No human labels available for flagged holdout paragraphs.")

    # Broader holdout analysis: all cases where Stage 1 said N/O but humans said something else
    separator("6b. HOLDOUT: ALL Stage1=N/O vs HUMAN DISAGREEMENTS")

    holdout_no_all = [pid for pid, p in holdout.items()
                      if p.get("stage1Category") == "None/Other"]
    stage1_no_human_disagree = []
    for pid in holdout_no_all:
        if pid in human_labels:
            hlabels = human_labels[pid]
            non_no = [hl for hl in hlabels if hl["category"] != "None/Other"]
            if non_no:
                stage1_no_human_disagree.append((pid, holdout[pid]["text"], hlabels))

    print(f"\n All holdout paragraphs with Stage1=N/O (any method): {len(holdout_no_all)}")
    print(f" Of those with human labels that disagree: {len(stage1_no_human_disagree)}")

    if stage1_no_human_disagree:
        # What did humans call them?
        human_override = Counter()
        for pid, text, hlabels in stage1_no_human_disagree:
            for hl in hlabels:
                if hl["category"] != "None/Other":
                    human_override[hl["category"]] += 1
        print(f"\n Humans' non-N/O labels for Stage1=N/O paragraphs:")
        for cat, cnt in human_override.most_common():
            print(f" {cat}: {cnt}")

    # ════════════════════════════════════════════════════════════════════
    # 7. Other confusion axes
    # ════════════════════════════════════════════════════════════════════
    separator("7. OTHER CONFUSION AXES IN STAGE 1")

    # 7a. Unanimous MR with program/framework/process language (potential RMP)
    mr_with_process = []
    for pid in unanimous_mr:
        text = texts.get(pid)
        if text is None:
            continue
        matches = PAT_PROGRAM_FRAMEWORK.findall(text)
        if len(matches) >= 2:  # Multiple mentions = likely process-focused
            mr_with_process.append((pid, text, matches))

    print(f"\n 7a. Unanimous MR with prominent program/framework/process language")
    print(f" (>=2 mentions — potentially should be RMP)")
    print(f" Count: {len(mr_with_process)} / {len(unanimous_mr)} ({len(mr_with_process) / max(1, len(unanimous_mr)) * 100:.1f}%)")
    print(f"\n --- 3 examples ---")
    for i, (pid, text, matches) in enumerate(mr_with_process[:3]):
        print_example(i + 1, pid, text, f"Pattern matches: {matches[:6]}")

    # 7b. Unanimous RMP with specific titles (potential MR)
    rmp_with_titles = []
    for pid in unanimous_rmp:
        text = texts.get(pid)
        if text is None:
            continue
        titles = PAT_TITLE.findall(text)
        if titles:
            rmp_with_titles.append((pid, text, titles))

    print(f"\n 7b. Unanimous RMP mentioning specific people/titles")
    print(f" (potentially should be MR)")
    print(f" Count: {len(rmp_with_titles)} / {len(unanimous_rmp)} ({len(rmp_with_titles) / max(1, len(unanimous_rmp)) * 100:.1f}%)")
    print(f"\n --- 3 examples ---")
    for i, (pid, text, titles) in enumerate(rmp_with_titles[:3]):
        print_example(i + 1, pid, text, f"Titles found: {titles[:5]}")

    # 7c. Unanimous BG primarily about management officers
    bg_about_mgmt = []
    for pid in unanimous_bg:
        text = texts.get(pid)
        if text is None:
            continue
        has_titles = PAT_TITLE.findall(text)
        has_mgmt = PAT_MANAGEMENT_OFFICERS.findall(text)
        # If it has management language but no board language
        board_pattern = re.compile(r"\b(?:board|director(?:s)?|committee|audit)\b", re.IGNORECASE)
        has_board = board_pattern.findall(text)
        if (has_titles or has_mgmt) and not has_board:
            bg_about_mgmt.append((pid, text, has_titles + has_mgmt))

    print(f"\n 7c. Unanimous BG primarily about management (no board/committee language)")
    print(f" Count: {len(bg_about_mgmt)} / {len(unanimous_bg)} ({len(bg_about_mgmt) / max(1, len(unanimous_bg)) * 100:.1f}%)")
    if bg_about_mgmt:
        print(f"\n --- 3 examples ---")
        for i, (pid, text, matches) in enumerate(bg_about_mgmt[:3]):
            print_example(i + 1, pid, text, f"Matches: {matches[:5]}")

    # ════════════════════════════════════════════════════════════════════
    # SUMMARY
    # ════════════════════════════════════════════════════════════════════
    separator("SUMMARY")

    print(f"""
DATASET OVERVIEW
 Total paragraphs annotated (3 models each): {total_annotations:,}
 Total unanimous labels: {total_unanimous:,}
 Unanimous N/O: {len(unanimous_no):,}
 Majority N/O (2/3): {len(majority_no):,}

PRIMARY CONCERN: N/O → SI MISCODING
 Unanimous N/O with materiality language: {len(no_with_mat):,} ({len(no_with_mat) / max(1, len(unanimous_no)) * 100:.1f}% of unanimous N/O)
 Majority N/O with materiality language: {len(maj_no_with_mat):,} ({len(maj_no_with_mat) / max(1, len(majority_no)) * 100:.1f}% of majority N/O)
 Unanimous N/O with insurance: {len(no_insurance):,}
 Unanimous N/O with budget/investment: {len(no_budget):,}
 Unanimous N/O with incident language: {len(no_incident):,}
 Total potentially miscoded (deduplicated): {len(no_any_miscoded):,}

IMPACT ON TRAINING SET
 % of unanimous N/O affected: {pct_no_affected:.1f}%
 % of all unanimous labels affected: {pct_total_affected:.1f}%
 % of all paragraphs affected: {pct_all_affected:.1f}%

OTHER CONFUSION AXES
 MR ↔ RMP confusion (MR with process language): {len(mr_with_process):,} / {len(unanimous_mr):,}
 RMP ↔ MR confusion (RMP with titles): {len(rmp_with_titles):,} / {len(unanimous_rmp):,}
 BG about management (no board language): {len(bg_about_mgmt):,} / {len(unanimous_bg):,}

HOLDOUT VALIDATION
 Stage1=unanimous N/O with materiality language: {len(holdout_no_mat):,}
 Of those with human labels: {len(holdout_no_mat_with_human):,}
""")

    if holdout_no_mat_with_human:
        human_cats_for_flagged = Counter()
        for pid, text, hlabels in holdout_no_mat_with_human:
            for hl in hlabels:
                human_cats_for_flagged[hl["category"]] += 1
        print(" HUMAN VALIDATION (flagged holdout paragraphs):")
        total_h = sum(human_cats_for_flagged.values())
        for cat, cnt in human_cats_for_flagged.most_common():
            print(f" {cat}: {cnt} ({cnt / total_h * 100:.1f}%)")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point; see the module docstring for how to run it.
    main()
|