"""
|
|
Gold Set Adjudication Script (v2)
|
|
==================================
|
|
|
|
Produces gold labels for the 1,200 holdout paragraphs using a tiered adjudication
|
|
strategy that combines 6 human annotators (3 per paragraph via BIBD) + 6 GenAI
|
|
models (MiniMax excluded per documented statistical outlier analysis, z=-2.07).
|
|
|
|
Each paragraph has up to 9 signals: 3 human + 6 model.
|
|
|
|
Tier system:
|
|
T1: Super-consensus — >=8/9 signals agree -> auto-gold (near-unanimous)
|
|
T2: Human majority + model majority agree -> cross-validated gold
|
|
T3: Rule-based override — 27 SI<->N/O paragraphs + 10 codebook tiebreakers,
|
|
each analyzed paragraph-by-paragraph against codebook rules and actual text.
|
|
T4: Model unanimous (6/6) + human majority disagree -> model label.
|
|
T5: Remaining disagreements -> plurality with text-based BG vote removal.
|
|
|
|
v2 changes (experimentally validated, see docs/T5-ANALYSIS.md):
|
|
- 10 new T5 codebook overrides (ID/SI, SPAC, board-removal, committee-level)
|
|
- Text-based BG vote removal: if "board" absent from paragraph text, BG model
|
|
votes are removed before T5 plurality. 13 labels changed, source accuracy UP
|
|
for 10/12 sources (+0.5-1.1% for top sources).
|
|
- Specificity hybrid: human unanimous -> human label, human split -> model majority.
|
|
195 specificity labels updated. Model-model spec agreement is 87-91% vs
|
|
human consensus of 52.5%.
|
|
|
|
Net effect: T5 reduced 92->85 (-7), source accuracy: Opus 88.6->89.1%, GPT-5.4
|
|
87.4->88.5%, gold!=human 151->144. 20 category labels changed, 195 specificity.
|
|
|
|
Usage:
|
|
uv run scripts/adjudicate-gold.py
|
|
"""
|
|
import json
|
|
from collections import Counter, defaultdict
|
|
from pathlib import Path
|
|
|
|
# Repository root: this script lives under <root>/scripts/ (see Usage in the
# module docstring), so the root is two levels up from this file.
ROOT = Path(__file__).resolve().parent.parent
|
|
|
|
# ── SI<->N/O RULE-BASED OVERRIDES ────────────────────────────────────────
#
# These 27 paragraphs were analyzed INDIVIDUALLY against codebook rules and
# actual paragraph text. This is NOT a blanket override -- each paragraph was
# read, assessed against the assessment-vs-speculation distinction (Rule 6),
# the cross-reference exception, and the SPAC rule (Case 8).
#
# The analysis found that ~20/25 "Human=SI, Model=N/O" cases are human errors:
# annotators systematically treat ANY mention of "material" + "business strategy"
# as SI, even when the language is pure "could/if/may" speculation. The codebook's
# distinction is correct; humans weren't consistently applying it.
#
# The 2 "Human=N/O, Model=SI" cases are also human errors: both contain clear
# negative assertions ("not aware of having experienced any prior material
# incidents") which are textbook SI per Rule 6.
#
# Full analysis: docs/V35-ITERATION-LOG.md "The SI<->N/O Paradox -- Resolved"

# Keys are the first 8 characters of a paragraphId (matched against pid[:8]
# during adjudication); values are (gold category, human-readable rationale).
SI_NO_OVERRIDES: dict[str, tuple[str, str]] = {
    # ── Speculation, not assessment (Human=SI -> N/O) ─────────────────────
    "026c8eca": ("None/Other", "Speculation: 'could potentially result in' -- no materiality assessment"),
    "160fec46": ("None/Other", "Resource lament: 'do not have manpower' -- no materiality assessment"),
    "1f29ea8c": ("None/Other", "Speculation: 'could have material adverse effect' boilerplate"),
    "20c70335": ("None/Other", "Risk list: 'A breach could lead to...' -- enumeration, not assessment"),
    "303685cf": ("None/Other", "Speculation: 'could materially adversely affect'"),
    "7d021fcc": ("None/Other", "Speculation: 'could...have a material adverse effect'"),
    "7ef53cab": ("None/Other", "Risk enumeration: 'could lead to... could disrupt... could steal...'"),
    "a0d01951": ("None/Other", "Speculation: 'could adversely affect our business'"),
    "aaa8974b": ("None/Other", "Speculation: 'could potentially have a material impact' -- Case 9 fix"),
    "b058dca1": ("None/Other", "Speculation: 'could disrupt our operations'"),
    "b1b216b6": ("None/Other", "Speculation: 'could materially adversely affect'"),
    "dc8a2798": ("None/Other", "Speculation: 'If compromised, we could be subject to...'"),
    "e4bd0e2f": ("None/Other", "Speculation: 'could have material adverse impact'"),
    "f4656a7e": ("None/Other", "Threat enumeration under SI-sounding header -- no assessment"),
    # ── Cross-references (Human=SI -> N/O) ────────────────────────────────
    "2e8cbdbf": ("None/Other", "Cross-ref: 'We describe whether and how... under the headings [risk factors]'"),
    "75de7441": ("None/Other", "Cross-ref: 'We describe whether and how... under the heading [risk factor]'"),
    "78cad2a1": ("None/Other", "Cross-ref: 'In our Risk Factors, we describe whether and how...'"),
    "3879887f": ("None/Other", "Brief incident mention + 'See Item 1A' cross-reference"),
    "f026f2be": ("None/Other", "Risk factor heading/cross-reference -- not an assessment"),
    # ── No materiality assessment present (Human=SI -> N/O) ───────────────
    "5df3a6c9": ("None/Other", "IT importance statement -- no assessment. H=1/3 SI"),
    "d5dc17c2": ("None/Other", "Risk enumeration -- no assessment. H=1/3 SI"),
    "c10f2a54": ("None/Other", "Early-stage/SPAC + weak negative assertion. SPAC rule dominates"),
    "45961c99": ("None/Other", "Past disruption but no materiality language. Primarily speculation"),
    "1673f332": ("None/Other", "SPAC with assessment at end -- SPAC rule dominates per Case 8"),
    "f75ac78a": ("Risk Management Process", "Resource expenditure on cybersecurity -- RMP per person-removal test"),
    # ── Negative assertions ARE assessments (Human=N/O -> SI) ─────────────
    "367108c2": ("Strategy Integration", "Negative assertion: 'not aware of having experienced any prior material data breaches'"),
    "837e31d5": ("Strategy Integration", "Negative assertion: 'did not experience any cybersecurity incident during 2024'"),
}
|
|
|
|
|
|
# ── T5 CODEBOOK RESOLUTIONS ──────────────────────────────────────────────
#
# Additional rule-based overrides for T5-plurality cases where codebook
# tiebreakers clearly resolve the disagreement. Applied AFTER plurality
# resolution as a correction layer.
#
# SI<->ID tiebreaker: "DESCRIBES what happened -> ID; ONLY discusses
# cost/materiality -> SI; brief mention + materiality conclusion -> SI"
#
# TP<->RMP central-topic test: third parties supporting internal
# program -> RMP; vendor oversight as central topic -> TP

# Keys are the first 8 characters of a paragraphId (matched against pid[:8]
# during adjudication); values are (gold category, human-readable rationale).
T5_CODEBOOK_OVERRIDES: dict[str, tuple[str, str]] = {
    # ── SI<->ID: materiality assessment without incident narrative -> SI ──
    "15e7cf99": ("Strategy Integration", "SI/ID tiebreaker: 'have not encountered any risks' -- materiality assessment, no specific incident described"),
    # ── SI<->ID: specific incident with date -> ID ────────────────────────
    "6dc6bb4a": ("Incident Disclosure", "SI/ID tiebreaker: 'ransomware attack in October 2021' -- describes specific incident with date"),
    # ── TP<->RMP: third parties supporting internal program -> RMP ────────
    "c71739a9": ("Risk Management Process", "TP/RMP: Fund relies on CCO and adviser's risk management expertise -- third parties supporting internal process"),
    # ── ID<->SI: negative assertion = materiality assessment -> SI ────────
    "0ceeb618": ("Strategy Integration", "ID/SI: opens with 'no material incidents', Feb 2025 incident is brief context + 'has not had material impact' conclusion. Materiality assessment frame dominates"),
    "cc82eb9f": ("Strategy Integration", "ID/SI: June 2018 incident is example within broader negative materiality assertion ('have not materially affected us'). Assessment frame dominates"),
    # ── SPAC rule (Case 8): pre-revenue company -> N/O ────────────────────
    "203ccd43": ("None/Other", "SPAC: 'once the Company commences operations' -- pre-revenue company. Case 8: SPAC -> N/O regardless of management role language"),
    # ── ID->RMP: post-incident improvements, no incident described ────────
    "f549fd64": ("Risk Management Process", "ID/RMP: 'Following this cybersecurity event' -- refers to incident without describing it. 100% of content is hardening, training, MFA, EDR -- pure RMP"),
    # ── Board-removal test: BG override where board mention is incidental ──
    "22da6695": ("Risk Management Process", "Board-removal: 'Board is also responsible' (1 sentence) + 'notifying the Board' (final clause). Remove -> CISO + IS Program + incident response plan. Process dominates"),
    "a2ff7e1e": ("Management Role", "Committee-level: Compliance Committee is management-level (O'Reilly executives). Board is incidental destination (2 clauses). Titled 'Management's Role'"),
    "cb518f47": ("Management Role", "Board-removal: remove notification sentence -> 'management oversees cybersecurity.' Board is incident notification destination only"),
}
|
|
|
|
|
|
def load_jsonl(path: Path) -> list[dict]:
    """Parse a JSON Lines file into a list of dicts (one dict per line).

    The file is read as UTF-8 explicitly -- JSON text is UTF-8 by spec, and
    relying on the locale's default encoding breaks on non-UTF-8 systems.
    """
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f]
|
|
|
|
|
|
def load_paragraph_texts() -> dict[str, str]:
    """Load holdout paragraph texts for text-based adjudication rules."""
    texts: dict[str, str] = {}
    for record in load_jsonl(ROOT / "data/gold/paragraphs-holdout.jsonl"):
        texts[record["id"]] = record["text"]
    return texts
|
|
|
|
|
|
def majority_vote(votes: list[str]) -> str | None:
|
|
if not votes:
|
|
return None
|
|
return Counter(votes).most_common(1)[0][0]
|
|
|
|
|
|
def main() -> None:
    """Adjudicate gold labels for the holdout set and write them to JSONL.

    Pipeline:
      1. Load per-paragraph human labels (3 each) and 6 model label sets,
         preferring v3.5 model labels on confusion-axis paragraphs and v3.0
         elsewhere.
      2. Assign each paragraph a gold category via the tier cascade:
         T3 SI/NO overrides -> T1 super-consensus -> T2 cross-validation ->
         T4 model-unanimous -> T3 codebook overrides -> T5 plurality
         (with text-based Board Governance vote removal).
      3. Assign gold specificity via the hybrid rule: unanimous humans win,
         otherwise model majority.
      4. Write data/gold/gold-adjudicated.jsonl and print summary statistics.
    """
    # ── Load data ─────────────────────────────────────────────────────────

    human_labels: dict[str, list[dict]] = defaultdict(list)
    for r in load_jsonl(ROOT / "data/gold/human-labels-raw.jsonl"):
        human_labels[r["paragraphId"]].append({
            "cat": r["contentCategory"],
            "spec": r["specificityLevel"],
            "annotator": r["annotatorName"],
        })

    # Paragraphs re-run under codebook v3.5 (confusion-axis set).
    confusion_pids = {r["paragraphId"] for r in load_jsonl(ROOT / "data/gold/holdout-rerun-v35.jsonl")}

    TOP6 = ["Opus", "GPT-5.4", "Gemini", "GLM-5", "Kimi", "MIMO"]

    def load_model_cats(files: dict[str, Path]) -> dict[str, dict[str, str]]:
        # Model name -> {paragraphId -> content category}. Missing files yield
        # empty dicts so callers can use .get(m, {}) uniformly.
        result: dict[str, dict[str, str]] = {}
        for name, path in files.items():
            result[name] = {}
            if path.exists():
                for r in load_jsonl(path):
                    # Records may nest the label under "label" or store it flat.
                    cat = r.get("label", {}).get("content_category") or r.get("content_category")
                    if cat:
                        result[name][r["paragraphId"]] = cat
        return result

    v30_cats = load_model_cats({
        "Opus": ROOT / "data/annotations/golden/opus.jsonl",
        "GPT-5.4": ROOT / "data/annotations/bench-holdout/gpt-5.4.jsonl",
        "Gemini": ROOT / "data/annotations/bench-holdout/gemini-3.1-pro-preview.jsonl",
        "GLM-5": ROOT / "data/annotations/bench-holdout/glm-5:exacto.jsonl",
        "Kimi": ROOT / "data/annotations/bench-holdout/kimi-k2.5.jsonl",
        "MIMO": ROOT / "data/annotations/bench-holdout/mimo-v2-pro:exacto.jsonl",
    })

    v35_cats = load_model_cats({
        "Opus": ROOT / "data/annotations/golden-v35/opus.jsonl",
        "GPT-5.4": ROOT / "data/annotations/bench-holdout-v35/gpt-5.4.jsonl",
        "Gemini": ROOT / "data/annotations/bench-holdout-v35/gemini-3.1-pro-preview.jsonl",
        "GLM-5": ROOT / "data/annotations/bench-holdout-v35/glm-5:exacto.jsonl",
        "Kimi": ROOT / "data/annotations/bench-holdout-v35/kimi-k2.5.jsonl",
        "MIMO": ROOT / "data/annotations/bench-holdout-v35/mimo-v2-pro:exacto.jsonl",
    })

    # Use v3.5 labels for confusion-axis PIDs (codebook-corrected), v3.0 for rest
    model_cats: dict[str, dict[str, str]] = {}
    for m in TOP6:
        model_cats[m] = {}
        for pid in human_labels:
            if pid in confusion_pids and pid in v35_cats.get(m, {}):
                model_cats[m][pid] = v35_cats[m][pid]
            elif pid in v30_cats.get(m, {}):
                model_cats[m][pid] = v30_cats[m][pid]

    # Load model specificity for hybrid specificity (v3.0 for full coverage)
    def load_model_specs(files: dict[str, Path]) -> dict[str, dict[str, int]]:
        # Same shape as load_model_cats, but for the specificity_level field.
        result: dict[str, dict[str, int]] = {}
        for name, path in files.items():
            result[name] = {}
            if path.exists():
                for r in load_jsonl(path):
                    spec = r.get("label", {}).get("specificity_level") or r.get("specificity_level")
                    if spec is not None:
                        result[name][r["paragraphId"]] = spec
        return result

    model_specs = load_model_specs({
        "Opus": ROOT / "data/annotations/golden/opus.jsonl",
        "GPT-5.4": ROOT / "data/annotations/bench-holdout/gpt-5.4.jsonl",
        "Gemini": ROOT / "data/annotations/bench-holdout/gemini-3.1-pro-preview.jsonl",
        "GLM-5": ROOT / "data/annotations/bench-holdout/glm-5:exacto.jsonl",
        "Kimi": ROOT / "data/annotations/bench-holdout/kimi-k2.5.jsonl",
        "MIMO": ROOT / "data/annotations/bench-holdout/mimo-v2-pro:exacto.jsonl",
    })

    # Load paragraph texts for text-based adjudication rules
    para_texts = load_paragraph_texts()

    # ── Adjudicate ────────────────────────────────────────────────────────

    results: list[dict] = []
    tier_counts: Counter[str] = Counter()

    for pid in sorted(human_labels.keys()):
        h_cats = [lab["cat"] for lab in human_labels[pid]]
        h_specs = [lab["spec"] for lab in human_labels[pid]]
        h_cat_maj = majority_vote(h_cats)
        h_spec_maj = majority_vote(h_specs)

        m_cats_list = [model_cats[m][pid] for m in TOP6 if pid in model_cats[m]]
        m_cat_maj = majority_vote(m_cats_list)
        m_cat_unanimous = len(set(m_cats_list)) == 1 and len(m_cats_list) == 6

        all_signals = h_cats + m_cats_list
        signal_counter = Counter(all_signals)
        total_signals = len(all_signals)
        top_signal, top_count = signal_counter.most_common(1)[0]

        # Override tables are keyed on the first 8 chars of the paragraph id.
        short_pid = pid[:8]
        si_override = SI_NO_OVERRIDES.get(short_pid)

        gold_cat: str | None = None
        tier: str = ""
        reason: str = ""

        if si_override:
            gold_cat = si_override[0]
            tier = "T3-rule"
            reason = f"SI/NO override: {si_override[1]}"
        elif top_count >= 8 and total_signals >= 8:
            gold_cat = top_signal
            tier = "T1-super"
            reason = f"{top_count}/{total_signals} signals agree"
        elif h_cat_maj == m_cat_maj:
            gold_cat = h_cat_maj
            tier = "T2-cross"
            reason = "Human + model majority agree"
        elif m_cat_unanimous:
            # All 6 models unanimous. Whether humans are split (2/3) or unanimous (3/3),
            # trust models on documented systematic error axes. Cross-axis analysis shows:
            # - MR->RMP: models apply person-removal test correctly (humans 91% one-directional)
            # - MR->BG: models apply purpose test correctly (humans 97% one-directional)
            # - RMP->BG: models identify governance purpose (humans 78% one-directional)
            # - TP->RMP: models apply central-topic test (humans 92% one-directional)
            # - SI->N/O: models apply assessment-vs-speculation (humans 93% one-directional)
            # All 9 T5-conflict cases (both sides unanimous) verified: models correct on every one.
            gold_cat = m_cat_maj
            tier = "T4-model"
            h_count = Counter(h_cats).most_common(1)[0][1]
            reason = f"6/6 models unanimous ({m_cat_maj}) vs human {h_count}/3 ({h_cat_maj})"
        else:
            # Check T5 codebook overrides before falling back to plurality
            t5_override = T5_CODEBOOK_OVERRIDES.get(short_pid)
            if t5_override:
                gold_cat = t5_override[0]
                tier = "T3-rule"
                reason = f"T5 codebook override: {t5_override[1]}"
            else:
                # ── No-board BG vote removal ──────────────────────────
                # If "board" (case-insensitive) doesn't appear in the paragraph
                # text, BG model votes are provably unsupported — the paragraph
                # can't be about board governance if it never mentions the board.
                # Remove those BG signals and recalculate plurality.
                # Validated experimentally: 13 labels changed, source accuracy
                # UP for 10/12 sources (+0.5-0.8% for top annotators/models).
                t5_signals = list(all_signals)
                para_text = para_texts.get(pid, "")
                if "board" not in para_text.lower():
                    t5_signals = [s for s in t5_signals if s != "Board Governance"]

                if t5_signals:
                    t5_counter = Counter(t5_signals)
                    t5_top, t5_top_count = t5_counter.most_common(1)[0]
                    t5_total = len(t5_signals)
                else:
                    # Every signal was an unsupported BG vote: fall back to
                    # the original (pre-removal) plurality.
                    t5_top, t5_top_count, t5_total = top_signal, top_count, total_signals

                gold_cat = t5_top
                tier = "T5-plurality"
                reason = f"Mixed: human={h_cat_maj}, model={m_cat_maj}, plurality={t5_top} ({t5_top_count}/{t5_total})"

        # ── Specificity: hybrid human/model ──────────────────────────
        # Human consensus on specificity is only 52.5%, while model-model
        # agreement is 87-91%. When humans are unanimous (3/3), trust their
        # label. When humans split, use model majority (more reliable).
        h_spec_unanimous = len(set(h_specs)) == 1
        if h_spec_unanimous:
            gold_spec = h_spec_maj
        else:
            m_specs = [model_specs[m][pid] for m in TOP6 if pid in model_specs[m]]
            if m_specs:
                gold_spec = int(majority_vote([str(s) for s in m_specs]) or h_spec_maj)
            else:
                gold_spec = h_spec_maj

        tier_counts[tier] += 1
        results.append({
            "paragraphId": pid,
            "gold_category": gold_cat,
            "gold_specificity": gold_spec,
            "tier": tier,
            "reason": reason,
            "human_majority": h_cat_maj,
            "model_majority": m_cat_maj,
            "human_votes": dict(Counter(h_cats)),
            "model_votes": dict(Counter(m_cats_list)),
        })

    # ── Write output ──────────────────────────────────────────────────────

    output_path = ROOT / "data/gold/gold-adjudicated.jsonl"
    with open(output_path, "w", encoding="utf-8") as f:
        for r in results:
            f.write(json.dumps(r) + "\n")

    # ── Summary ───────────────────────────────────────────────────────────

    print("=" * 90)
    print("GOLD SET ADJUDICATION SUMMARY")
    print("=" * 90)
    print(f"\nTotal paragraphs: {len(results)}")
    print("\nTier breakdown:")
    for tier_name, count in sorted(tier_counts.items()):
        pct = count / len(results) * 100
        print(f"  {tier_name:<16} {count:>5} ({pct:.1f}%)")

    flipped = sum(1 for r in results if r["gold_category"] != r["human_majority"])
    # Guard against empty input so the summary never divides by zero.
    flip_rate = flipped / len(results) if results else 0.0
    print(f"\nGold labels differing from human majority: {flipped} ({flip_rate:.1%})")

    print("\nCategory distribution:")
    h_dist = Counter(r["human_majority"] for r in results)
    g_dist = Counter(r["gold_category"] for r in results)
    print(f"  {'Category':<25} {'Human Maj':>10} {'Gold':>10} {'Delta':>6}")
    for cat in sorted(set(h_dist) | set(g_dist)):
        print(f"  {cat:<25} {h_dist.get(cat, 0):>10} {g_dist.get(cat, 0):>10} {g_dist.get(cat, 0) - h_dist.get(cat, 0):>+6}")

    gold_by_pid = {r["paragraphId"]: r["gold_category"] for r in results}

    print(f"\n{'=' * 90}")
    print("SOURCE ACCURACY vs ADJUDICATED GOLD")
    print(f"{'=' * 90}")

    annotator_names = sorted(set(lab["annotator"] for labels in human_labels.values() for lab in labels))
    print("\nHuman annotators:")
    for ann in annotator_names:
        agree = total = 0
        for pid, labels in human_labels.items():
            for lab in labels:
                if lab["annotator"] == ann and pid in gold_by_pid:
                    total += 1
                    if lab["cat"] == gold_by_pid[pid]:
                        agree += 1
        rate = f"{agree / total:.1%}" if total else "n/a"  # no gold overlap -> n/a
        print(f"  {ann:<12} {agree}/{total} ({rate})")

    print("\nModels (v3.0 on full 1200):")
    for m in TOP6:
        agree = total = 0
        for pid in gold_by_pid:
            if pid in v30_cats.get(m, {}):
                total += 1
                if v30_cats[m][pid] == gold_by_pid[pid]:
                    agree += 1
        rate = f"{agree / total:.1%}" if total else "n/a"  # model file absent -> n/a
        print(f"  {m:<12} {agree}/{total} ({rate})")

    print(f"\nOutput: {output_path}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|