"""
Audit Stage 1 annotations for systematic SI↔N/O miscoding.

Stage 1 used prompt v2.5 which lacked the rule "materiality disclaimers → SI."
This script quantifies how many N/O labels likely should have been SI, plus
other potential miscoding axes.

Run: uv run --with numpy scripts/audit-stage1-labels.py
"""

import json
import re
import sys
from collections import Counter, defaultdict
from collections.abc import Iterator
from pathlib import Path

# ── Paths ──────────────────────────────────────────────────────────────
ROOT = Path(__file__).resolve().parent.parent
ANNOTATIONS = ROOT / "data" / "annotations" / "stage1.patched.jsonl"
PARAGRAPHS = ROOT / "data" / "paragraphs" / "paragraphs-clean.patched.jsonl"
PARAGRAPHS_FALLBACK = ROOT / "data" / "paragraphs" / "paragraphs-clean.jsonl"
HOLDOUT = ROOT / "data" / "gold" / "paragraphs-holdout.jsonl"
HUMAN_LABELS = ROOT / "data" / "gold" / "human-labels-raw.jsonl"

# ── Materiality regex patterns ─────────────────────────────────────────
# Pattern 1: "material" near business/strategy language (within ~15 words),
# in either order: "material ... <business phrase>" or
# "<business phrase> ... material".
PAT_MATERIAL_NEAR_BIZ = re.compile(
    r"material(?:ly)?\b.{0,100}\b(?:business\s+strategy|results\s+of\s+operations|financial\s+condition|business|operations)"
    r"|"
    r"(?:business\s+strategy|results\s+of\s+operations|financial\s+condition)\b.{0,100}\bmaterial(?:ly)?",
    re.IGNORECASE,
)

# Pattern 2: specific materiality disclaimer phrases
PAT_MATERIALITY_DISCLAIMER = re.compile(
    r"have\s+not\s+materially\s+affected"
    r"|has\s+not\s+materially\s+affected"
    r"|could\s+materially\s+affect"
    r"|could\s+have\s+a\s+material\s+(?:adverse\s+)?(?:effect|impact)"
    r"|may\s+(?:materially|have\s+a\s+material)\s+(?:adverse\s+)?(?:effect|impact|affect)"
    r"|reasonably\s+likely\s+to\s+materially\s+affect"
    r"|not\s+reasonably\s+likely"
    r"|materially\s+(?:adverse(?:ly)?|impact|affect)"
    r"|material\s+adverse\s+(?:effect|impact)"
    r"|no\s+material\s+(?:adverse\s+)?(?:effect|impact)"
    r"|did\s+not\s+(?:have\s+a\s+)?material(?:ly)?\s+(?:adverse\s+)?(?:effect|impact|affect)",
    re.IGNORECASE,
)

# Pattern 3: explicit SI-relevant phrases
PAT_SI_PHRASES = re.compile(
    r"business\s+strategy"
    r"|results\s+of\s+operations"
    r"|financial\s+condition"
    r"|integrated\s+(?:into|with)\s+(?:our\s+)?(?:overall|business)"
    r"|part\s+of\s+(?:our\s+)?(?:overall|broader)\s+(?:risk|enterprise|business)",
    re.IGNORECASE,
)


def has_materiality_language(text: str) -> bool:
    """Returns True if text contains materiality-related language indicative of SI."""
    return bool(
        PAT_MATERIALITY_DISCLAIMER.search(text)
        or PAT_SI_PHRASES.search(text)
        or PAT_MATERIAL_NEAR_BIZ.search(text)
    )


# ── Insurance / budget / incident patterns ─────────────────────────────
PAT_INSURANCE = re.compile(r"\binsurance\b", re.IGNORECASE)
PAT_BUDGET = re.compile(r"\b(?:budget|investment(?:s)?)\b", re.IGNORECASE)
PAT_INCIDENT = re.compile(
    r"\bwe\s+(?:experienced|suffered|detected|identified|discovered|encountered|were\s+subject\s+to)\b",
    re.IGNORECASE,
)

# ── Cross-category confusion patterns ──────────────────────────────────
PAT_PROGRAM_FRAMEWORK = re.compile(
    r"\b(?:program|framework|process(?:es)?|procedure(?:s)?)\b", re.IGNORECASE
)
# Every alternative is anchored with a leading \b. Without it, the acronym
# branch matches inside ordinary words under IGNORECASE (e.g. the "cro" in
# "micro-segmentation" would be reported as the title "CRO").
PAT_TITLE = re.compile(
    r"\b(?:Chief\s+(?:Information|Technology|Executive|Financial|Security|Operating|Risk)\s+(?:Officer|Security\s+Officer))"
    r"|\b(?:CISO|CIO|CTO|CFO|CEO|COO|CRO)\b"
    r"|\b(?:Vice\s+President|Director|Senior\s+Vice\s+President|EVP|SVP)\b",
    re.IGNORECASE,
)
PAT_MANAGEMENT_OFFICERS = re.compile(
    r"\b(?:management|officer(?:s)?|executive(?:s)?|leader(?:s)?(?:hip)?)\b",
    re.IGNORECASE,
)
# Board-level vocabulary used by check 7c; compiled once at import time
# rather than once per paragraph.
PAT_BOARD = re.compile(r"\b(?:board|director(?:s)?|committee|audit)\b", re.IGNORECASE)


def separator(title: str) -> None:
    """Print a blank line followed by an 80-column banner containing *title*."""
    width = 80
    print()
    print("=" * width)
    print(f" {title}")
    print("=" * width)


def print_example(idx: int, pid: str, text: str, extra: str = "") -> None:
    """Print one example paragraph.

    *idx* is the display number, *pid* the paragraph id, and *extra* an
    optional annotation line printed before the text.
    """
    print(f"\n [{idx}] paragraphId: {pid}")
    if extra:
        print(f" {extra}")
    # Truncate (not wrap) long paragraphs to 500 chars to keep the report readable
    wrapped = text
    if len(wrapped) > 500:
        wrapped = wrapped[:500] + "..."
    print(f" TEXT: {wrapped}")


# ── Load data ──────────────────────────────────────────────────────────
def _iter_jsonl(path: Path) -> Iterator[dict]:
    """Yield one parsed JSON object per non-blank line of *path*.

    The file is always decoded as UTF-8 so results do not depend on the
    platform's locale encoding. Malformed lines still raise
    ``json.JSONDecodeError``.
    """
    with open(path, encoding="utf-8") as f:
        for line in f:
            if line.strip():
                yield json.loads(line)


def load_annotations() -> dict[str, list[dict]]:
    """Returns {paragraphId: [annotation, ...]}"""
    by_para: dict[str, list[dict]] = defaultdict(list)
    for d in _iter_jsonl(ANNOTATIONS):
        pid = d["paragraphId"]
        cat = d["label"]["content_category"]
        model = d["provenance"]["modelId"]
        by_para[pid].append({"category": cat, "model": model})
    return dict(by_para)


def load_paragraphs() -> dict[str, str]:
    """Returns {paragraphId: text}"""
    # Prefer the patched paragraph file; fall back to the unpatched one.
    path = PARAGRAPHS if PARAGRAPHS.exists() else PARAGRAPHS_FALLBACK
    return {d["id"]: d["text"] for d in _iter_jsonl(path)}


def load_holdout() -> dict[str, dict]:
    """Returns {paragraphId: {text, stage1Category, stage1Method, ...}}"""
    return {d["id"]: d for d in _iter_jsonl(HOLDOUT)}


def load_human_labels() -> dict[str, list[dict]]:
    """Returns {paragraphId: [{annotator, category, specificity}, ...]}"""
    labels: dict[str, list[dict]] = defaultdict(list)
    for d in _iter_jsonl(HUMAN_LABELS):
        labels[d["paragraphId"]].append(
            {
                "annotator": d["annotatorName"],
                "category": d["contentCategory"],
                "specificity": d["specificityLevel"],
            }
        )
    return dict(labels)


def _spread_sample(items: list, k: int) -> list:
    """Return up to *k* items taken at an even stride across *items*."""
    step = max(1, len(items) // k)
    return items[::step][:k]


def _tally_human_categories(flagged: list[tuple[str, str, list[dict]]]) -> Counter:
    """Count human category votes over (pid, text, human_labels) triples."""
    tally: Counter = Counter()
    for _pid, _text, hlabels in flagged:
        for hl in hlabels:
            tally[hl["category"]] += 1
    return tally


def main() -> None:
    """Load all inputs and print the full audit report to stdout."""
    print("Loading data...")
    annotations = load_annotations()
    texts = load_paragraphs()
    holdout = load_holdout()
    human_labels = load_human_labels()

    print(f" Annotations: {sum(len(v) for v in annotations.values())} across {len(annotations)} paragraphs")
    print(f" Paragraph texts loaded: {len(texts)}")
    print(f" Holdout paragraphs: {len(holdout)}")
    print(f" Human-labeled paragraphs: {len(human_labels)}")

    # ── Classify each paragraph by voting ──────────────────────────────
    unanimous_no: list[str] = []
    majority_no: list[str] = []  # 2/3 N/O
    unanimous_si: list[str] = []
    unanimous_mr: list[str] = []
    unanimous_rmp: list[str] = []
    unanimous_bg: list[str] = []
    all_unanimous: dict[str, str] = {}  # pid -> category for unanimous

    # Categories whose unanimous paragraphs are tracked individually; any
    # other unanimous category is only recorded in all_unanimous.
    unanimous_buckets: dict[str, list[str]] = {
        "None/Other": unanimous_no,
        "Strategy Integration": unanimous_si,
        "Management Role": unanimous_mr,
        "Risk Management Process": unanimous_rmp,
        "Board Governance": unanimous_bg,
    }
    complete_paragraphs = 0  # paragraphs with exactly 3 annotations

    for pid, anns in annotations.items():
        cats = [a["category"] for a in anns]
        if len(cats) != 3:
            continue  # skip incomplete
        complete_paragraphs += 1
        cat_counts = Counter(cats)

        if cat_counts.get("None/Other", 0) == 2:
            majority_no.append(pid)

        # Track all unanimous
        if len(cat_counts) == 1:
            all_unanimous[pid] = cats[0]
            bucket = unanimous_buckets.get(cats[0])
            if bucket is not None:
                bucket.append(pid)

    print(f"\n Unanimous N/O: {len(unanimous_no)}")
    print(f" Majority N/O (2/3): {len(majority_no)}")
    print(f" Unanimous SI: {len(unanimous_si)}")
    print(f" Unanimous MR: {len(unanimous_mr)}")
    print(f" Unanimous RMP: {len(unanimous_rmp)}")
    print(f" Unanimous BG: {len(unanimous_bg)}")
    print(f" Total unanimous (any): {len(all_unanimous)}")

    # ════════════════════════════════════════════════════════════════════
    # 1. Unanimous N/O with materiality language
    # ════════════════════════════════════════════════════════════════════
    separator("1. UNANIMOUS N/O WITH MATERIALITY LANGUAGE")

    no_with_mat: list[tuple[str, str]] = []
    no_without_text = 0
    for pid in unanimous_no:
        text = texts.get(pid)
        if text is None:
            no_without_text += 1
            continue
        if has_materiality_language(text):
            no_with_mat.append((pid, text))

    pct_no_mat = len(no_with_mat) / max(1, len(unanimous_no)) * 100

    print(f"\n Total unanimous N/O: {len(unanimous_no)}")
    print(f" Missing text: {no_without_text}")
    print(f" With materiality language: {len(no_with_mat)}")
    print(f" Percentage of unanimous N/O: {pct_no_mat:.1f}%")

    print("\n --- 10 representative examples ---")
    # Pick a diverse sample: take every Nth
    for n, (pid, text) in enumerate(_spread_sample(no_with_mat, 10), start=1):
        print_example(n, pid, text)

    # ════════════════════════════════════════════════════════════════════
    # 2. Majority N/O with materiality language
    # ════════════════════════════════════════════════════════════════════
    separator("2. MAJORITY N/O (2/3) WITH MATERIALITY LANGUAGE")

    # pid, text, dissenting_model, dissenting_cat
    maj_no_with_mat: list[tuple[str, str, str, str]] = []
    for pid in majority_no:
        text = texts.get(pid)
        if text is None:
            continue
        if has_materiality_language(text):
            # Record the first (normally the only) non-N/O vote as the dissent.
            for a in annotations[pid]:
                if a["category"] != "None/Other":
                    maj_no_with_mat.append((pid, text, a["model"], a["category"]))
                    break

    pct_maj_mat = len(maj_no_with_mat) / max(1, len(majority_no)) * 100

    print(f"\n Total majority N/O (2/3): {len(majority_no)}")
    print(f" With materiality language: {len(maj_no_with_mat)}")
    print(f" Percentage: {pct_maj_mat:.1f}%")

    # Count dissenting categories
    dissent_cats = Counter(x[3] for x in maj_no_with_mat)
    print("\n Dissenting model voted:")
    for cat, cnt in dissent_cats.most_common():
        print(f" {cat}: {cnt}")

    # Count dissenting models
    dissent_models = Counter(x[2] for x in maj_no_with_mat)
    print("\n Which models dissented:")
    for model, cnt in dissent_models.most_common():
        print(f" {model}: {cnt}")

    print("\n --- 5 examples ---")
    for n, (pid, text, model, cat) in enumerate(_spread_sample(maj_no_with_mat, 5), start=1):
        print_example(n, pid, text, f"Dissent: {model} → {cat}")

    # ════════════════════════════════════════════════════════════════════
    # 3. Unanimous SI examples (contrast)
    # ════════════════════════════════════════════════════════════════════
    separator("3. UNANIMOUS SI — WHAT CLEAN SI LOOKS LIKE")

    # Only the first 5 unanimous-SI paragraphs with non-empty text are shown.
    si_examples: list[tuple[str, str]] = []
    for pid in unanimous_si:
        text = texts.get(pid)
        if text:
            si_examples.append((pid, text))
            if len(si_examples) >= 5:
                break

    print(f"\n Total unanimous SI: {len(unanimous_si)}")
    print("\n --- 5 examples ---")
    for i, (pid, text) in enumerate(si_examples):
        print_example(i + 1, pid, text)

    # Analyze SI language patterns
    si_has_materiality = 0
    si_has_insurance = 0
    si_has_budget = 0
    for pid in unanimous_si:
        text = texts.get(pid)
        if text is None:
            continue
        if has_materiality_language(text):
            si_has_materiality += 1
        if PAT_INSURANCE.search(text):
            si_has_insurance += 1
        if PAT_BUDGET.search(text):
            si_has_budget += 1

    print("\n SI language patterns:")
    print(f" With materiality language: {si_has_materiality} / {len(unanimous_si)} ({si_has_materiality / max(1, len(unanimous_si)) * 100:.1f}%)")
    print(f" Mention insurance: {si_has_insurance} / {len(unanimous_si)}")
    print(f" Mention budget/investment: {si_has_budget} / {len(unanimous_si)}")

    # ════════════════════════════════════════════════════════════════════
    # 4. N/O with other potential miscoding
    # ════════════════════════════════════════════════════════════════════
    separator("4. N/O PARAGRAPHS WITH OTHER POTENTIAL MISCODING")

    no_insurance: list[tuple[str, str]] = []
    no_budget: list[tuple[str, str]] = []
    no_incident: list[tuple[str, str]] = []

    for pid in unanimous_no:
        text = texts.get(pid)
        if text is None:
            continue
        if PAT_INSURANCE.search(text):
            no_insurance.append((pid, text))
        if PAT_BUDGET.search(text):
            no_budget.append((pid, text))
        if PAT_INCIDENT.search(text):
            no_incident.append((pid, text))

    print(f"\n Unanimous N/O mentioning insurance: {len(no_insurance)}")
    print(f" Unanimous N/O mentioning budget/investment: {len(no_budget)}")
    print(f" Unanimous N/O mentioning incidents ('we experienced...'): {len(no_incident)}")

    # Show examples for each
    print("\n --- Insurance examples (up to 3) ---")
    for i, (pid, text) in enumerate(no_insurance[:3]):
        print_example(i + 1, pid, text)

    print("\n --- Budget/investment examples (up to 3) ---")
    for i, (pid, text) in enumerate(no_budget[:3]):
        print_example(i + 1, pid, text)

    print("\n --- Incident examples (up to 3) ---")
    for i, (pid, text) in enumerate(no_incident[:3]):
        print_example(i + 1, pid, text)

    # ════════════════════════════════════════════════════════════════════
    # 5. Scale the problem
    # ════════════════════════════════════════════════════════════════════
    separator("5. SCALE THE PROBLEM")

    mat_set = {pid for pid, _ in no_with_mat}
    ins_set = {pid for pid, _ in no_insurance}
    bud_set = {pid for pid, _ in no_budget}
    no_incident_pids = {pid for pid, _ in no_incident}

    # Deduplicate: some paragraphs may hit multiple patterns.
    # Incident hits are deliberately excluded and reported separately.
    no_any_miscoded = mat_set | ins_set | bud_set

    total_unanimous = len(all_unanimous)
    # Count only fully-annotated paragraphs so the figure matches its label
    # and the population the voting loop above actually classified.
    total_annotations = complete_paragraphs

    print(f"\n Total paragraphs with 3 annotations: {total_annotations}")
    print(f" Total unanimous (any category): {total_unanimous}")
    print(f" Total unanimous N/O: {len(unanimous_no)}")
    print()
    print(" Potentially miscoded unanimous N/O:")
    print(f" Materiality language (likely SI): {len(no_with_mat)}")
    print(f" Insurance (likely SI): {len(no_insurance)}")
    print(f" Budget/investment (likely SI): {len(no_budget)}")
    print(f" Incident language (likely SI or ID): {len(no_incident)}")
    print(f" Any of above (deduplicated): {len(no_any_miscoded)}")
    print(f" Incident (separate concern): {len(no_incident_pids)}")
    print()

    # Overlap analysis
    print(" Overlap analysis:")
    print(f" Materiality ∩ Insurance: {len(mat_set & ins_set)}")
    print(f" Materiality ∩ Budget: {len(mat_set & bud_set)}")
    print(f" Insurance ∩ Budget: {len(ins_set & bud_set)}")
    print()

    pct_no_affected = len(no_any_miscoded) / max(1, len(unanimous_no)) * 100
    pct_total_affected = len(no_any_miscoded) / max(1, total_unanimous) * 100
    pct_all_affected = len(no_any_miscoded) / max(1, total_annotations) * 100
    print(" Impact estimates:")
    print(f" % of unanimous N/O potentially miscoded: {pct_no_affected:.1f}%")
    print(f" % of all unanimous labels affected: {pct_total_affected:.1f}%")
    print(f" % of all paragraphs affected: {pct_all_affected:.1f}%")

    # Also check majority N/O
    maj_no_any = set()
    for pid in majority_no:
        text = texts.get(pid)
        if text is None:
            continue
        if has_materiality_language(text) or PAT_INSURANCE.search(text) or PAT_BUDGET.search(text):
            maj_no_any.add(pid)

    print(f"\n Majority N/O (2/3) potentially miscoded: {len(maj_no_any)} / {len(majority_no)}")
    print(f" Combined (unanimous + majority) potentially miscoded N/O: {len(no_any_miscoded) + len(maj_no_any)}")

    # ════════════════════════════════════════════════════════════════════
    # 6. Cross-check with holdout / human labels
    # ════════════════════════════════════════════════════════════════════
    separator("6. HOLDOUT CROSS-CHECK WITH HUMAN LABELS")

    # Find holdout paragraphs that Stage 1 unanimously called N/O but contain materiality language
    holdout_no_mat: list[tuple[str, str]] = []
    holdout_no_mat_with_human: list[tuple[str, str, list[dict]]] = []
    holdout_unanimous_no = 0

    for pid, para in holdout.items():
        if para.get("stage1Category") == "None/Other" and para.get("stage1Method") == "unanimous":
            holdout_unanimous_no += 1
            text = para["text"]
            if has_materiality_language(text):
                holdout_no_mat.append((pid, text))
                if pid in human_labels:
                    holdout_no_mat_with_human.append((pid, text, human_labels[pid]))

    print(f"\n Holdout paragraphs with stage1 unanimous N/O: {holdout_unanimous_no}")
    print(f" Of those, with materiality language: {len(holdout_no_mat)}")
    print(f" Of those, with human labels: {len(holdout_no_mat_with_human)}")

    # What did humans call these?
    human_cats_for_flagged = _tally_human_categories(holdout_no_mat_with_human)
    if holdout_no_mat_with_human:
        print("\n Human labels for flagged paragraphs (Stage1=unanimous N/O, has materiality language):")
        total_human = sum(human_cats_for_flagged.values())
        for cat, cnt in human_cats_for_flagged.most_common():
            print(f" {cat}: {cnt} ({cnt / total_human * 100:.1f}%)")

        print("\n --- Examples where humans disagreed with Stage 1 N/O ---")
        shown = 0
        for pid, text, hlabels in holdout_no_mat_with_human:
            non_no = [hl for hl in hlabels if hl["category"] != "None/Other"]
            if non_no:
                human_str = ", ".join(f"{hl['annotator']}={hl['category']}" for hl in hlabels)
                print_example(shown + 1, pid, text, f"Human labels: {human_str}")
                shown += 1
                if shown >= 5:
                    break

        # Also show ones where humans agreed it IS N/O
        print("\n --- Examples where humans also said N/O (materiality language is ambiguous) ---")
        shown = 0
        for pid, text, hlabels in holdout_no_mat_with_human:
            all_no = all(hl["category"] == "None/Other" for hl in hlabels)
            if all_no and len(hlabels) >= 2:
                print_example(shown + 1, pid, text, "All humans agreed: N/O")
                shown += 1
                if shown >= 3:
                    break
    else:
        print("\n No human labels available for flagged holdout paragraphs.")

    # Broader holdout analysis: all cases where Stage 1 said N/O but humans said something else
    separator("6b. HOLDOUT: ALL Stage1=N/O vs HUMAN DISAGREEMENTS")

    holdout_no_all = [pid for pid, p in holdout.items() if p.get("stage1Category") == "None/Other"]
    stage1_no_human_disagree = []
    for pid in holdout_no_all:
        if pid in human_labels:
            hlabels = human_labels[pid]
            non_no = [hl for hl in hlabels if hl["category"] != "None/Other"]
            if non_no:
                stage1_no_human_disagree.append((pid, holdout[pid]["text"], hlabels))

    print(f"\n All holdout paragraphs with Stage1=N/O (any method): {len(holdout_no_all)}")
    print(f" Of those with human labels that disagree: {len(stage1_no_human_disagree)}")

    if stage1_no_human_disagree:
        # What did humans call them?
        human_override = Counter()
        for pid, text, hlabels in stage1_no_human_disagree:
            for hl in hlabels:
                if hl["category"] != "None/Other":
                    human_override[hl["category"]] += 1
        print("\n Humans' non-N/O labels for Stage1=N/O paragraphs:")
        for cat, cnt in human_override.most_common():
            print(f" {cat}: {cnt}")

    # ════════════════════════════════════════════════════════════════════
    # 7. Other confusion axes
    # ════════════════════════════════════════════════════════════════════
    separator("7. OTHER CONFUSION AXES IN STAGE 1")

    # 7a. Unanimous MR with program/framework/process language (potential RMP)
    mr_with_process = []
    for pid in unanimous_mr:
        text = texts.get(pid)
        if text is None:
            continue
        matches = PAT_PROGRAM_FRAMEWORK.findall(text)
        if len(matches) >= 2:  # Multiple mentions = likely process-focused
            mr_with_process.append((pid, text, matches))

    print("\n 7a. Unanimous MR with prominent program/framework/process language")
    print(" (>=2 mentions — potentially should be RMP)")
    print(f" Count: {len(mr_with_process)} / {len(unanimous_mr)} ({len(mr_with_process) / max(1, len(unanimous_mr)) * 100:.1f}%)")

    print("\n --- 3 examples ---")
    for i, (pid, text, matches) in enumerate(mr_with_process[:3]):
        print_example(i + 1, pid, text, f"Pattern matches: {matches[:6]}")

    # 7b. Unanimous RMP with specific titles (potential MR)
    rmp_with_titles = []
    for pid in unanimous_rmp:
        text = texts.get(pid)
        if text is None:
            continue
        titles = PAT_TITLE.findall(text)
        if titles:
            rmp_with_titles.append((pid, text, titles))

    print("\n 7b. Unanimous RMP mentioning specific people/titles")
    print(" (potentially should be MR)")
    print(f" Count: {len(rmp_with_titles)} / {len(unanimous_rmp)} ({len(rmp_with_titles) / max(1, len(unanimous_rmp)) * 100:.1f}%)")

    print("\n --- 3 examples ---")
    for i, (pid, text, titles) in enumerate(rmp_with_titles[:3]):
        print_example(i + 1, pid, text, f"Titles found: {titles[:5]}")

    # 7c. Unanimous BG primarily about management officers
    bg_about_mgmt = []
    for pid in unanimous_bg:
        text = texts.get(pid)
        if text is None:
            continue
        has_titles = PAT_TITLE.findall(text)
        has_mgmt = PAT_MANAGEMENT_OFFICERS.findall(text)
        # If it has management language but no board language
        has_board = PAT_BOARD.findall(text)
        if (has_titles or has_mgmt) and not has_board:
            bg_about_mgmt.append((pid, text, has_titles + has_mgmt))

    print("\n 7c. Unanimous BG primarily about management (no board/committee language)")
    print(f" Count: {len(bg_about_mgmt)} / {len(unanimous_bg)} ({len(bg_about_mgmt) / max(1, len(unanimous_bg)) * 100:.1f}%)")

    if bg_about_mgmt:
        print("\n --- 3 examples ---")
        for i, (pid, text, matches) in enumerate(bg_about_mgmt[:3]):
            print_example(i + 1, pid, text, f"Matches: {matches[:5]}")

    # ════════════════════════════════════════════════════════════════════
    # SUMMARY
    # ════════════════════════════════════════════════════════════════════
    separator("SUMMARY")

    print(f"""
 DATASET OVERVIEW
   Total paragraphs annotated (3 models each): {total_annotations:,}
   Total unanimous labels: {total_unanimous:,}
   Unanimous N/O: {len(unanimous_no):,}
   Majority N/O (2/3): {len(majority_no):,}

 PRIMARY CONCERN: N/O → SI MISCODING
   Unanimous N/O with materiality language: {len(no_with_mat):,} ({pct_no_mat:.1f}% of unanimous N/O)
   Majority N/O with materiality language: {len(maj_no_with_mat):,} ({pct_maj_mat:.1f}% of majority N/O)
   Unanimous N/O with insurance: {len(no_insurance):,}
   Unanimous N/O with budget/investment: {len(no_budget):,}
   Unanimous N/O with incident language: {len(no_incident):,}
   Total potentially miscoded (deduplicated): {len(no_any_miscoded):,}

 IMPACT ON TRAINING SET
   % of unanimous N/O affected: {pct_no_affected:.1f}%
   % of all unanimous labels affected: {pct_total_affected:.1f}%
   % of all paragraphs affected: {pct_all_affected:.1f}%

 OTHER CONFUSION AXES
   MR ↔ RMP confusion (MR with process language): {len(mr_with_process):,} / {len(unanimous_mr):,}
   RMP ↔ MR confusion (RMP with titles): {len(rmp_with_titles):,} / {len(unanimous_rmp):,}
   BG about management (no board language): {len(bg_about_mgmt):,} / {len(unanimous_bg):,}

 HOLDOUT VALIDATION
   Stage1=unanimous N/O with materiality language: {len(holdout_no_mat):,}
   Of those with human labels: {len(holdout_no_mat_with_human):,}
""")

    if holdout_no_mat_with_human:
        print(" HUMAN VALIDATION (flagged holdout paragraphs):")
        total_h = sum(human_cats_for_flagged.values())
        for cat, cnt in human_cats_for_flagged.most_common():
            print(f" {cat}: {cnt} ({cnt / total_h * 100:.1f}%)")


if __name__ == "__main__":
    main()