From c9497f57092122bddc1cf3263a1a8d9bff106575 Mon Sep 17 00:00:00 2001
From: Joey Eamigh <55670930+JoeyEamigh@users.noreply.github.com>
Date: Thu, 2 Apr 2026 02:02:36 -0400
Subject: [PATCH] 6 model panel benchmark

---
 docs/NARRATIVE.md          |  87 +++++++++++++++++++-----
 docs/POST-LABELING-PLAN.md | 131 ++++++++++++++++++-------------------
 docs/STATUS.md             | 128 ++++++++++++++++++++++++------------
 ts/src/label/annotate.ts   |  46 +++++++++----
 ts/src/label/prompts.ts    |   9 ++-
 ts/src/lib/openrouter.ts   |   8 +--
 6 files changed, 263 insertions(+), 146 deletions(-)

diff --git a/docs/NARRATIVE.md b/docs/NARRATIVE.md
index f02f261..0b42800 100644
--- a/docs/NARRATIVE.md
+++ b/docs/NARRATIVE.md
@@ -869,17 +869,19 @@ Only nano's portion ($21.24) of the first run was wasted — the gemini and grok
 | Labelapp build + infrastructure | ~8h | Monorepo restructure, Next.js app, quiz/warmup/labeling flows, BIBD assignment, sampling, Docker deployment, timer + migration infrastructure |
 | DAPT pre-training | ~14.5h GPU | 1 epoch on 500M tokens, RTX 3090. Two sessions (resumed from checkpoint-1280). |
 | TAPT debugging + pre-training | ~2h dev + ~50min GPU | 4 bugs in transformers whole-word masking + Python 3.14 rollback. Training: 5 epochs on 72K paragraphs, 50 min. |
-| **Total to date** | **~52h** | Includes ~15.3h GPU time |
+| Human labeling (1,200 paragraphs, 6 annotators) | 21.5h active | $0 (team labor) |
+| Post-labeling analysis + gold set tooling | ~3h | $0 |
+| **Total to date** | **~76.5h** | Includes ~15.3h GPU + 21.5h human labeling |
 
 ### Remaining Work (estimated)
 
 | Phase | Est. Hours | Est. Cost |
 |-------|-----------|-----------|
-| Human labeling (1,200 paragraphs, 6 annotators) | ~6-8h | $0 (team labor) |
-| Stage 2 judge production run (~3-5K paragraphs) | ~1h | ~$20-40 |
+| GenAI holdout benchmark (6 models × 1,200) | ~1h | ~$15-43 |
+| Opus golden re-run (1,200 paragraphs) | ~1h | $0 (subscription) |
+| Gold set adjudication (13+ signals/paragraph) | ~4h | $0 |
 | Training data assembly | ~2h | $0 |
 | Fine-tuning + ablations (7 experiments) | ~12-20h GPU | $0 |
-| Full GenAI benchmark on 1,200 holdout (9 models) | ~1h | ~$30-50 |
 | Evaluation + comparison + write-up | ~6-8h | $0 |
 
 ---
@@ -957,27 +959,71 @@ Three models from three providers — minimizes correlated errors.
 | xiaomi/mimo-v2-pro | Xiaomi | structured | Tested | — | Benchmarked |
 | moonshotai/kimi-k2.5 | Moonshot AI | structured | Tested | — | Only 26/50 completed — high failure rate |
 
-### Summary: 18 Models, 10 Providers
+### Phase 6: Holdout Benchmark — 6 models from 6 suppliers
 
-| Provider | Models Tested | Models in Production |
-|----------|--------------|---------------------|
-| Google | gemini-3.1-flash-lite, gemini-3.1-flash, gemini-3-flash | gemini-3.1-flash-lite (Stage 1) |
-| OpenAI | gpt-oss-120b, gpt-5.4-nano, gpt-4.1-mini, gpt-4.1-nano, gpt-5.4-mini, gpt-5.4 | — (nano dropped) |
-| xAI | grok-4.1-fast, grok-4.20-beta | grok-4.1-fast (Stage 1) |
-| Xiaomi | mimo-v2-flash, mimo-v2-pro | mimo-v2-flash (Stage 1) |
-| Anthropic | claude-haiku-4.5, claude-sonnet-4.6 | sonnet-4.6 (gold labels) |
-| Zhipu AI | glm-4.5-air, glm-5 | TBD (Stage 2 judge) |
+After human labeling was completed, 6 models were benchmarked against the 1,200-paragraph holdout with the v3.0 prompt:
+
+| Model | Provider | Cost/call | Latency | Notes |
+|-------|----------|-----------|---------|-------|
+| openai/gpt-5.4 | OpenAI | $0.009 | 5s | |
+| moonshotai/kimi-k2.5 | Moonshot | $0.006 | 33s | |
+| google/gemini-3.1-pro-preview | Google | $0.006 | 3s | |
+| z-ai/glm-5 | Zhipu | $0.006 | ~40s | exacto routing |
+| minimax/minimax-m2.7 | MiniMax | $0.002 | 11s | Raw text mode (markdown fences) |
+| xiaomi/mimo-v2-pro | Xiaomi | $0.006 | 32s | exacto routing |
+
+Plus Opus 4.6 (Anthropic) via Agent SDK on all 1,200 holdout paragraphs.
+
+### Summary: 21+ Models, 12 Providers
+
+| Provider | Models Tested | Role |
+|----------|--------------|------|
+| Google | gemini-3.1-flash-lite, gemini-3.1-flash, gemini-3-flash, gemini-3.1-pro | Stage 1 panel + benchmark |
+| OpenAI | gpt-oss-120b, gpt-5.4-nano, gpt-4.1-mini, gpt-4.1-nano, gpt-5.4-mini, gpt-5.4 | Benchmark |
+| xAI | grok-4.1-fast, grok-4.20-beta | Stage 1 panel |
+| Xiaomi | mimo-v2-flash, mimo-v2-pro | Stage 1 panel + benchmark |
+| Anthropic | claude-haiku-4.5, claude-sonnet-4.6, claude-opus-4.6 | Gold labels (Opus), judge |
+| Zhipu AI | glm-4.5-air, glm-5 | Benchmark |
+| MiniMax | minimax-m2.5, minimax-m2.7 | Benchmark |
+| Moonshot AI | kimi-k2.5 | Benchmark |
 | ByteDance | seed-2.0-lite | — (too expensive for scale) |
 | NVIDIA | nemotron-3-super-120b | — (worst performer) |
 | Mistral | mistral-small-2603 | — (zero reasoning) |
-| MiniMax | minimax-m2.5 | — (mediocre) |
-| Moonshot AI | kimi-k2.5 | — (high failure rate) |
 | Meta | llama-4-maverick | — (smoke test only) |
 | Alibaba | qwen3-235b-a22b | — (smoke test only) |
 | DeepSeek | deepseek-chat-v3-0324 | — (smoke test only) |
 
 ---
 
+## Phase 13: GenAI Holdout Benchmark
+
+### Benchmark Panel
+
+With human labeling complete, the next step is running 6+ GenAI models from 3+ suppliers on the same 1,200 holdout paragraphs — both as an assignment requirement and to generate the 13+ annotation signals needed for gold set adjudication.
+
+The benchmark panel uses the v3.0 prompt (with codebook rulings) and runs via OpenRouter:
+
+| Model | Supplier | Cost/call | Latency | Structured Output |
+|-------|----------|-----------|---------|-------------------|
+| openai/gpt-5.4 | OpenAI | $0.009 | 5s | Native |
+| moonshotai/kimi-k2.5 | Moonshot | $0.006 | 33s | Raw text + fence stripping |
+| google/gemini-3.1-pro-preview | Google | $0.006 | 3s | Native |
+| z-ai/glm-5 | Zhipu | $0.006 | ~40s | Native (exacto routing) |
+| minimax/minimax-m2.7 | MiniMax | $0.002 | 11s | Raw text + fence stripping |
+| xiaomi/mimo-v2-pro | Xiaomi | $0.006 | 32s | Native (exacto routing) |
+
+Plus Claude Opus 4.6 via Agent SDK (subscription, no per-call cost) with full codebook as system prompt.
+
+Combined with the 3 Stage 1 models already on file: **10 models from 8 suppliers**.
+
+**MiniMax structured output workaround:** MiniMax m2.7 wraps JSON responses in markdown code fences (` ```json ... ``` `), which the Vercel AI SDK's `Output.object()` parser cannot handle. Rather than using tool calling (which drops accuracy ~7pp based on GLM-5 testing) or a fallback retry (2x cost), MiniMax models skip structured output entirely and use raw text generation with regex fence stripping before Zod validation (`annotateParagraph` routes `moonshotai/` models through the same raw-text path). The enum values are correct with the full v3.0 prompt; only the fences are the issue.
+
+### Opus Golden Re-Run
+
+The Opus golden labeling was re-run on the correct 1,200 holdout paragraphs. A previous run had annotated a different set of 1,200 paragraphs due to `.sampled-ids.json` being overwritten (previous labels preserved at `data/annotations/golden/opus.wrong-sample.jsonl`). The re-run uses parallelized Agent SDK workers (configurable concurrency) with serialized file writes for crash safety.
+
+---
+
 ## Key Technical Artifacts
 
 | Artifact | Location | Description |
@@ -987,7 +1033,12 @@ Three models from three providers — minimizes correlated errors.
 | Paragraphs | `data/paragraphs/paragraphs-clean.jsonl` | 72,045 paragraphs with filing metadata |
 | Gold labels | `data/bench/judges/gold-final.json` | 50 adjudicated gold labels |
 | Gold adjudications | `data/bench/judges/gold-adjudicated.json` | 11 detailed adjudication decisions with reasoning |
-| Stage 1 prompt | `ts/src/label/prompts.ts` | SYSTEM_PROMPT (v2.5) + buildJudgePrompt() |
+| Human labels (raw) | `data/gold/human-labels-raw.jsonl` | 3,600 labels with timing, notes, session IDs |
+| Human label metrics | `data/gold/metrics.json` | Full IRR: per-dimension alpha, pairwise kappa matrices, per-category/stratum rates |
+| Holdout paragraphs | `data/gold/paragraphs-holdout.jsonl` | 1,200 holdout paragraphs with Stage 1 consensus metadata |
+| Diagnostic charts | `data/gold/charts/` | 16 analysis charts (kappa heatmaps, confusion matrices, distributions, etc.) |
+| Analysis script | `scripts/analyze-gold.py` | Comprehensive cross-source analysis (human × Stage 1 × Opus) |
+| Annotation prompt | `ts/src/label/prompts.ts` | SYSTEM_PROMPT (v3.0) + buildJudgePrompt() |
 | Annotation runner | `ts/scripts/stage1-run.ts` | Resume-safe, configurable concurrency |
 | Orphan re-annotation | `ts/scripts/rerun-orphan-stage1.ts` | Re-ran 1,537 patched paragraphs, $3.30 |
 | Re-annotation diff | `ts/scripts/diff-orphan-annotations.ts` | Category/specificity change analysis |
@@ -999,7 +1050,9 @@ Three models from three providers — minimizes correlated errors.
 | Judge diagnostics | `ts/scripts/judge-diag.ts`, `judge-diag-batch.ts` | GLM-5 failure investigation |
 | Model benchmarking | `ts/scripts/model-bench.ts` | Stage 1 candidate evaluation |
 | Golden annotation (Opus) | `ts/src/label/golden.ts` | Agent SDK runner for gold set, saves reasoning traces |
-| Golden annotations | `data/annotations/golden/opus.jsonl` | Opus 4.6 labels + thinking + raw confidence |
+| Golden annotations | `data/annotations/golden/opus.jsonl` | Opus 4.6 labels + thinking + raw confidence (re-run on correct holdout) |
+| Benchmark annotations | `data/annotations/bench-holdout/{model}.jsonl` | 6 models × 1,200 paragraphs, v3.0 prompt |
+| Stale golden (wrong sample) | `data/annotations/golden/opus.wrong-sample.jsonl` | Original Opus run on wrong 1,200 paragraphs (preserved) |
 
 ---
 
diff --git a/docs/POST-LABELING-PLAN.md b/docs/POST-LABELING-PLAN.md
index 3a8ec0b..5ae7ba0 100644
--- a/docs/POST-LABELING-PLAN.md
+++ b/docs/POST-LABELING-PLAN.md
@@ -1,111 +1,104 @@
 # Post-Labeling Plan — Gold Set Repair & Final Pipeline
 
-Written 2026-04-01 while waiting for the last human annotator to finish.
+Updated 2026-04-02 with actual human labeling results.
 
 ---
 
-## The Situation
+## Human Labeling Results
 
-Human labeling is nearly complete (1,200 paragraphs, 6 annotators, 3 per paragraph via BIBD). Current inter-annotator agreement:
-- **Cohen's Kappa (avg):** 0.622
-- **Krippendorff's alpha:** 0.616
+Completed 2026-04-01. 3,600 labels (1,200 paragraphs × 3 annotators via BIBD), 21.5 active hours total.
 
-These numbers are at the floor of "substantial agreement" (Landis & Koch) but below the 0.667 threshold Krippendorff recommends for tentative conclusions. The holdout was deliberately stratified to over-sample hard cases (120 Management↔RMP splits, 80 None/Other↔Strategy splits, 80 Spec [3,4] splits, etc.), so raw consensus reflects sampling difficulty, not pure annotator quality.
+### Per-Dimension Agreement
 
-The task is genuinely hard: 7 categories, 4 specificity levels, 5 decision rules, 3 codebook rulings, multi-step reasoning required (person-vs-function test, QV fact counting). The GenAI panel struggled with the same boundaries.
+| Metric | Category | Specificity | Both |
+|--------|----------|-------------|------|
+| Consensus (3/3 agree) | 56.8% | 42.3% | 27.0% |
+| Krippendorff's α | **0.801** | 0.546 | — |
+| Avg Cohen's κ | 0.612 | 0.440 | — |
 
----
+**Category is reliable.** α = 0.801 exceeds the 0.80 conventional threshold. Human majority matches Stage 1 GenAI majority on 83.3% of paragraphs for category.
 
-## Immediate Analysis (once last annotator finishes)
+**Specificity is unreliable.** α = 0.546 is well below the 0.667 threshold. Driven by two factors: one outlier annotator and a genuinely hard Spec 3↔4 boundary.
 
-1. **Export labels** from labelapp (`bun run la:export`)
-2. **Per-dimension alpha:** Compute Krippendorff's alpha for category and specificity separately. Hypothesis: category alpha is significantly higher than specificity alpha (matching the GenAI pattern where Spec 4 was only 37.6% unanimous).
-3. **Pairwise Kappa matrix:** All 15 annotator pairs. Identify if one annotator is a systematic outlier or if disagreement is uniform.
-4. **Stratum-level agreement:** Break down consensus rates by sampling stratum (Management↔RMP, None/Other↔Strategy, Spec [3,4], proportional random, etc.). The hard strata should show lower agreement; the proportional random stratum should be higher.
+### The Aaryan Problem
+
+One annotator (Aaryan) is a systematic outlier:
+- Labels 67% of paragraphs as Spec 4 (Quantified-Verifiable) — others: 8-23%, Stage 1: 9%
+- Specificity bias: +1.28 levels vs Stage 1 (massive over-rater)
+- Specificity κ: 0.03-0.25 (essentially chance)
+- Category κ: 0.40-0.50 (below "moderate")
+- Only 3 quiz attempts (lowest; others: 6-11)
+
+Excluding his label on his 600 paragraphs: both-unanimous jumps from 5% → 50% (+45pp).
+
+### Confusion Axes (Human vs GenAI — Same Order)
+
+1. Management Role ↔ Risk Management Process (dominant)
+2. Board Governance ↔ Management Role
+3. None/Other ↔ Strategy Integration (materiality disclaimers)
+
+The same axes, in the same order, for both humans and the GenAI panel. The codebook boundaries drive disagreement, not annotator or model limitations.
 
 ---
 
 ## The Adverse Incentive Problem
 
-The assignment requires F1 > 0.80 on the holdout to pass. This creates a perverse incentive: pick easy, unambiguous paragraphs for the holdout → high human agreement, high GenAI scores, high fine-tuned model F1 → passing grade, meaningless evaluation.
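+The assignment requires F1 > 0.80 on the holdout to pass, which creates a perverse incentive to fill the holdout with easy, unambiguous paragraphs. We did the opposite: the holdout was deliberately stratified to over-sample hard decision boundaries (120 MR↔RMP, 80 N/O↔SI, 80 Spec [3,4] splits, etc.).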
 
-We did the opposite: stratified to stress-test decision boundaries. This produces a harder holdout with lower headline numbers but an actually informative evaluation.
-
-**Mitigation:** Report F1 on both the full 1,200 holdout AND the 720-paragraph "proportional stratified random" subsample separately. The proportional subsample approximates what a random holdout would look like. The delta between the two quantifies exactly how much performance degrades at decision boundaries. This isn't gaming — it's rigorous reporting.
-
-The A-grade criteria ("error analysis," "comparison to amateur labels") are directly served by our approach. The low human agreement rate is a finding, not a failure.
+**Mitigation:** Report F1 on both the full 1,200 holdout AND the 720-paragraph proportional subsample. The delta quantifies performance degradation at decision boundaries. The stratified design directly serves the A-grade "error analysis" criterion.
 
 ---
 
-## Gold Set Repair Strategy: 13+ Signals Per Paragraph
+## Gold Set Repair Strategy: 13 Signals Per Paragraph
 
-### Existing signals (7 per paragraph)
-- 3 human labels (from labelapp, with notes and timing)
-- 3 Stage 1 GenAI labels (gemini-flash-lite, mimo-v2-flash, grok-4.1-fast)
-- 1 Opus golden label (with full reasoning trace)
+### Annotation sources per paragraph
 
-### New signals from GenAI benchmark (6+ additional)
-The assignment requires benchmarking 6+ models from 3+ suppliers against the holdout. This serves triple duty:
-1. Assignment deliverable (GenAI benchmark table)
-2. Gold set repair evidence (6+ more annotation signals for adjudication)
-3. "GenAI vs amateur" comparison (A-grade criterion)
-
-**Candidate models (6+ from 3+ suppliers):**
-- OpenAI: gpt-5.4-mini, gpt-5.4
-- Google: gemini-3-flash, gemini-3-pro (or similar)
-- Anthropic: claude-sonnet-4.6, claude-haiku-4.5
-- xAI: grok-4.20 (or similar)
-- Others as needed for count
-
-After the benchmark, each paragraph has **13+ independent annotations**. This is an absurdly rich signal for adjudication.
+| Source | Count | Prompt | Notes |
+|--------|-------|--------|-------|
+| Human annotators | 3 | Codebook v3.0 | With notes, timing data |
+| Stage 1 panel (gemini-flash-lite, mimo-flash, grok-fast) | 3 | v2.5 | Already on file |
+| Opus 4.6 golden | 1 | v2.5 + full codebook | With reasoning traces |
+| Benchmark models (gpt-5.4, kimi-k2.5, gemini-pro, glm-5, minimax-m2.7, mimo-pro) | 6 | v3.0 | Running now |
+| **Total** | **13** | | |
 
 ### Adjudication tiers
 
-**Tier 1 — High confidence:** 10+/13 annotators agree on both dimensions. Gold label, no intervention needed. Expected: ~500-600 paragraphs.
+**Tier 1 — High confidence:** 10+/13 agree on both dimensions. Gold label, no intervention.
 
-**Tier 2 — Clear majority with cross-validation:** Human majority exists (2/3) and matches GenAI consensus (majority of 10 GenAI labels). Strong signal — take the consensus. Expected: ~300-400 paragraphs.
+**Tier 2 — Clear majority with cross-validation:** Human majority (2/3) matches GenAI consensus (majority of 10 GenAI labels). Take the consensus.
 
-**Tier 3 — Human split, GenAI consensus:** Humans disagree but GenAI labels converge. Use Opus reasoning trace + GenAI consensus to inform expert adjudication. Human (Joey) makes the final call. Expected: ~100-200 paragraphs.
+**Tier 3 — Human split, GenAI consensus:** Humans disagree but GenAI labels converge. Expert adjudication informed by Opus reasoning traces. Human makes the final call.
 
-**Tier 4 — Universal disagreement:** Humans and GenAI both split. Genuinely ambiguous. Expert adjudication with documented reasoning, or flag as inherently ambiguous and report in error analysis. Expected: ~50-100 paragraphs.
+**Tier 4 — Universal disagreement:** Everyone splits. Expert adjudication with documented reasoning, or flagged as inherently ambiguous for error analysis.
 
-The GenAI labels are evidence for adjudication, not the gold label itself. The final label is always a human decision. This avoids circularity — we're not evaluating GenAI against GenAI-derived labels. We're using GenAI agreement patterns to identify which human label is most likely correct in cases of human disagreement.
-
-If we can't produce reliable gold labels from 13+ signals per paragraph, the construct itself is ill-defined. That would be an important finding too — but given that the GenAI panel achieved 70.8% both-unanimous on 50K paragraphs (unstratified), and the hardest axes have clear codebook resolutions, the construct should hold.
+GenAI labels are evidence for adjudication, not the gold label itself. The final label is always a human decision — this avoids circularity.
 
 ---
 
-## The Meta-Narrative
+## Task Sequence
 
-The finding that trained student annotators achieve α = 0.616 while calibrated LLM panels achieve 70.8%+ unanimity on the same task validates the synthetic experts hypothesis. For complex, rule-heavy classification tasks requiring multi-step reasoning, LLMs with reasoning tokens can match or exceed human annotation quality.
+### In progress
+- [x] Human labeling — complete
+- [x] Data export and IRR analysis — complete
+- [x] Prompt v3.0 update with codebook rulings — complete
+- [x] GenAI benchmark infrastructure — complete
+- [ ] Opus golden re-run on correct holdout (running, ~1h with 20 workers)
+- [ ] 6-model benchmark on holdout (running, high concurrency)
 
-This isn't a failure of the humans — it's the whole point of the project. The Ringel pipeline exists because these tasks are too cognitively demanding for consistent human annotation at scale. The human labels are essential as a calibration anchor, but GenAI's advantage on rule-application tasks is a key finding.
-
----
-
-## Task Sequence (dependency order)
-
-### Can start now (no blockers)
-- [ ] Judge prompt v3.0 update (codebook rulings → `buildJudgePrompt()`)
-- [ ] Fine-tuning pipeline code (dual-head classifier, sample weighting, train/val/test split)
-- [ ] GenAI benchmark infrastructure (scripts to run 6+ models on holdout)
-
-### After last annotator finishes
-- [ ] Export + per-dimension alpha + pairwise Kappa matrix + stratum breakdown
-- [ ] Run GenAI benchmark on 1,200 holdout (6+ models, 3+ suppliers)
-- [ ] Gold set adjudication using 13+ signals per paragraph
-- [ ] Judge v3.0 validation against adjudicated gold set
+### After benchmark completes
+- [ ] Cross-source analysis with all 13 signals (update `analyze-gold.py`)
+- [ ] Gold set adjudication using tiered strategy
+- [ ] Training data assembly (unanimous + calibrated majority + judge)
 
 ### After gold set is finalized
-- [ ] Training data assembly (unanimous + calibrated majority + judge)
-- [ ] Fine-tuning + ablations (7 experiments)
+- [ ] Fine-tuning + ablations (7 experiments: {base, +DAPT, +DAPT+TAPT} × {±SCL} + best)
 - [ ] Final evaluation on holdout
 - [ ] Writeup + IGNITE slides
 
 ---
 
-## Open Questions
+## The Meta-Narrative
 
-1. **F1 threshold per-dimension?** Worth asking Ringel if the 0.80 F1 requirement applies to the joint 28-class label or can be reported per-dimension (category + specificity separately).
-2. **Soft labels for ambiguous cases?** For Tier 4 paragraphs, could use label distributions as soft targets during training instead of forcing a hard label. More sophisticated but harder to evaluate.
-3. **One bad annotator vs. uniform disagreement?** The pairwise Kappa matrix will answer this. If one annotator is systematically off, their labels could be downweighted during adjudication.
+The finding that trained student annotators achieve α = 0.801 on category but only α = 0.546 on specificity, while calibrated LLM panels achieve 70.8%+ both-unanimous on an easier sample, validates the synthetic experts hypothesis for rule-heavy classification tasks. The human labels are essential as a calibration anchor, but GenAI's advantage on multi-step reasoning tasks (like QV fact counting) is itself a key finding.
+
+The low specificity agreement is not annotator incompetence — it's evidence that the specificity construct requires cognitive effort that humans don't consistently invest at the 15-second-per-paragraph pace the task demands. The GenAI panel, which processes every paragraph with the same systematic attention to the IS/NOT lists and counting rules, achieves more consistent results on this specific dimension.
diff --git a/docs/STATUS.md b/docs/STATUS.md
index 9fb544a..411c815 100644
--- a/docs/STATUS.md
+++ b/docs/STATUS.md
@@ -1,4 +1,4 @@
-# Project Status — 2026-03-30
+# Project Status — 2026-04-02
 
 ## What's Done
 
@@ -33,70 +33,114 @@
 - [x] `docs/DAPT-PROCEDURE.md` — pre-flight checklist, commands, monitoring guide
 - [x] `docs/NARRATIVE.md` — 11 phases documented through TAPT completion
 
+## What's Done (since last update)
+
+### Human Labeling — Complete
+- [x] All 6 annotators completed 600 paragraphs each (3,600 labels total, 1,200 paragraphs × 3)
+- [x] BIBD assignment: each paragraph labeled by exactly 3 of 6 annotators
+- [x] Full data export: raw labels, timing, quiz sessions, metrics → `data/gold/`
+- [x] Comprehensive IRR analysis with 16 diagnostic charts → `data/gold/charts/`
+
+### Human Labeling Results
+
+| Metric | Category | Specificity | Both |
+|--------|----------|-------------|------|
+| Consensus (3/3 agree) | 56.8% | 42.3% | 27.0% |
+| Krippendorff's α | 0.801 | 0.546 | — |
+| Avg Cohen's κ | 0.612 | 0.440 | — |
+
+**Key findings:**
+- **Category is reliable (α=0.801)** — above the 0.80 threshold for reliable data
+- **Specificity is unreliable (α=0.546)** — driven primarily by one outlier annotator (Aaryan, +1.28 specificity levels vs Stage 1, κ=0.03-0.25 on specificity) and a genuinely hard Spec 3↔4 boundary
+- **Human majority = Stage 1 majority on 83.3% of categories** — strong cross-validation
+- **Same confusion axes** in humans and GenAI: MR↔RMP (#1), BG↔MR (#2), N/O↔SI (#3)
+- **Excluding outlier annotator:** both-unanimous jumps from 5% → 50% on his paragraphs (+45pp)
+- **Timing:** 21.5 active hours total, median 14.9s per paragraph
+
+### Prompt v3.0
+- [x] Updated `SYSTEM_PROMPT` with codebook v3.0 rulings: materiality disclaimers → SI, SPACs → N/O, person-vs-function test for MR↔RMP
+- [x] Prompt version bumped from v2.5 → v3.0
+
+### GenAI Holdout Benchmark — In Progress
+Running 6 benchmark models + Opus on the 1,200 holdout paragraphs:
+
+| Model | Supplier | Est. Cost/call | Notes |
+|-------|----------|---------------|-------|
+| openai/gpt-5.4 | OpenAI | $0.009 | Structured output |
+| moonshotai/kimi-k2.5 | Moonshot | $0.006 | Raw text + fence stripping |
+| google/gemini-3.1-pro-preview | Google | $0.006 | Structured output |
+| z-ai/glm-5 | Zhipu | $0.006 | Structured output, exacto routing |
+| minimax/minimax-m2.7 | MiniMax | $0.002 | Raw text + fence stripping |
+| xiaomi/mimo-v2-pro | Xiaomi | $0.006 | Structured output, exacto routing |
+| anthropic/claude-opus-4.6 | Anthropic | $0 (subscription) | Agent SDK, parallel workers |
+
+Plus Stage 1 panel (gemini-flash-lite, mimo-v2-flash, grok-4.1-fast) already on file = **10 models, 8 suppliers**.
+
 ## What's In Progress
 
-### Human Labeling (139/1,200)
-- 3 of 6 annotators started: 68 + 50 + 21 paragraphs completed
-- Deployed via labelapp with quiz gating + warmup
-- Each annotator needs 600 paragraphs (BIBD assignment)
+### Opus Golden Re-Run
+- Opus golden labels being re-run on the correct 1,200 holdout paragraphs (previous run was on a stale sample due to `.sampled-ids.json` being overwritten)
+- Previous Opus labels (different 1,200 paragraphs) preserved at `data/annotations/golden/opus.wrong-sample.jsonl`
+- Using parallelized Agent SDK workers (concurrency=20)
+
+### GenAI Benchmark
+- 6 models running on holdout with v3.0 prompt, high concurrency (200)
+- Output: `data/annotations/bench-holdout/{model}.jsonl`
 
 ## What's Next (in dependency order)
 
-### 1. Fine-tuning pipeline (no blockers — can build now)
-Build the dual-head classifier (7-class category + 4-class specificity) with:
-- Shared ModernBERT backbone + 2 linear classification heads
-- Sample weighting from quality tiers (1.0 clean/headed/minor, 0.5 degraded)
-- Confidence-stratified label assembly (unanimous → majority → judge)
-- Train/val/test split with stratification
-- Ablation configs: base vs +DAPT vs +DAPT+TAPT
+### 1. Gold set adjudication (blocked on benchmark + Opus completion)
+Each paragraph will have **13 independent annotations**: 3 human + 3 Stage 1 + 1 Opus + 6 benchmark models.
+Adjudication tiers:
+- **Tier 1:** 10+/13 agree → gold label, no intervention
+- **Tier 2:** Human majority + GenAI consensus agree → take consensus
+- **Tier 3:** Humans split, GenAI converges → expert adjudication using Opus reasoning traces
+- **Tier 4:** Universal disagreement → expert adjudication with documented reasoning
 
-### 3. Judge prompt v3.0 update (no blockers — can do now)
-Update `buildJudgePrompt()` with codebook v3.0 rulings:
-- Materiality disclaimers → Strategy Integration
-- SPACs → None/Other
-- Person-vs-function test for Management↔RMP
-Then re-bench against gold labels.
+### 2. Training data assembly (blocked on adjudication)
+- Unanimous Stage 1 labels (35,204 paragraphs) → full weight
+- Calibrated majority labels (~9-12K) → full weight
+- Judge high-confidence labels (~2-3K) → full weight
+- Quality tier weights: clean/headed/minor=1.0, degraded=0.5
 
-### 4. Training data assembly (blocked on judge + human labels)
-Combine all annotation sources into final training dataset:
-- Unanimous Stage 1 labels (35,204 paragraphs, ~97% accuracy)
-- Calibrated majority labels (~9-12K, ~85-90%)
-- Judge high-confidence labels (~2-3K, ~84%)
-- Judge low-confidence → downweight or exclude
-- Quality tier sample weights applied
-
-### 4. Judge production run (blocked on human gold labels)
-Run judge on ~409 unresolved + flagged majority cases. Validate against expanded gold set from human labels.
-
-### 5. Fine-tuning + ablations (blocked on steps 1-3)
+### 3. Fine-tuning + ablations (blocked on training data)
 7 experiments: {base, +DAPT, +DAPT+TAPT} × {with/without SCL} + best config.
+Dual-head classifier: shared ModernBERT backbone + 2 linear classification heads.
 
-### 6. Evaluation + paper (blocked on everything above)
-Full GenAI benchmark (9 models) on 1,200 holdout. Comparison tables. Write-up.
+### 4. Evaluation + paper (blocked on everything above)
+Full GenAI benchmark (10 models) on 1,200 holdout. Comparison tables. Write-up. IGNITE slides.
 
 ## Parallel Tracks
 
 ```
-Track A (GPU):  DAPT ✓ → TAPT ✓ → Fine-tuning → Eval
-                                                ↑
-Track B (API):  Judge v3 → Judge run ───────────┤
-                                                ↑
-Track C (Human): Labeling (139/1200) → Gold set validation
-                                                ↑
-Track D (Code): Fine-tune pipeline build ───────┘
+Track A (GPU):  DAPT ✓ → TAPT ✓ ──────────────→ Fine-tuning → Eval
+                                                        ↑
+Track B (API):  Opus re-run ─┐                          │
+                             ├→ Gold adjudication ──────┤
+Track C (API):  6-model bench┘                          │
+                                                        │
+Track D (Human): Labeling ✓ → IRR analysis ✓ ───────────┘
 ```
 
-DAPT + TAPT complete. Track D (fine-tune pipeline) can proceed now. Track B can start (prompt update) but production run waits for Track C. Everything converges at fine-tuning.
-
 ## Key File Locations
 
 | What | Where |
 |------|-------|
-| Patched paragraphs | `data/paragraphs/training.patched.jsonl` (49,795) |
+| Patched paragraphs | `data/paragraphs/paragraphs-clean.patched.jsonl` (49,795) |
 | Patched annotations | `data/annotations/stage1.patched.jsonl` (150,009) |
 | Quality scores | `data/paragraphs/quality/quality-scores.jsonl` (72,045) |
+| Human labels (raw) | `data/gold/human-labels-raw.jsonl` (3,600 labels) |
+| Human label metrics | `data/gold/metrics.json` |
+| Holdout paragraphs | `data/gold/paragraphs-holdout.jsonl` (1,200) |
+| Diagnostic charts | `data/gold/charts/*.png` (16 charts) |
+| Opus golden labels | `data/annotations/golden/opus.jsonl` (re-run on correct holdout) |
+| Benchmark annotations | `data/annotations/bench-holdout/{model}.jsonl` |
+| Original sampled IDs | `labelapp/.sampled-ids.original.json` (1,200 holdout PIDs) |
 | DAPT corpus | `data/dapt-corpus/shard-*.jsonl` (14,756 docs) |
 | DAPT config | `python/configs/dapt/modernbert.yaml` |
 | TAPT config | `python/configs/tapt/modernbert.yaml` |
 | DAPT checkpoint | `checkpoints/dapt/modernbert-large/final/` |
+| TAPT checkpoint | `checkpoints/tapt/modernbert-large/final/` |
 | Training CLI | `python/main.py dapt --config ...` |
+| Analysis script | `scripts/analyze-gold.py` |
+| Data dump script | `labelapp/scripts/dump-all.ts` |
diff --git a/ts/src/label/annotate.ts b/ts/src/label/annotate.ts
index 1f517be..282fadf 100644
--- a/ts/src/label/annotate.ts
+++ b/ts/src/label/annotate.ts
@@ -20,9 +20,9 @@ function buildProviderOptions(effort: ReasoningEffort) {
 }
 
 /** Extract cost from the result, checking both raw usage and providerMetadata. */
-function extractCost(result: { usage: unknown; providerMetadata?: unknown }): number {
+function extractCost(result: { usage?: unknown; providerMetadata?: unknown }): number {
   // Primary: raw usage.cost (always present in our smoke test)
-  const raw = result.usage as { raw?: { cost?: number } };
+  const raw = (result.usage ?? {}) as { raw?: { cost?: number } };
   if (raw.raw?.cost !== undefined) return raw.raw.cost;
 
   // Fallback: providerMetadata.openrouter.usage.cost
@@ -58,22 +58,42 @@ export async function annotateParagraph(
   const requestedAt = new Date().toISOString();
   const start = Date.now();
 
+  // Models that wrap JSON in markdown fences — skip structured output, parse raw text
+  const useRawText = modelId.startsWith("minimax/") || modelId.startsWith("moonshotai/");
+
   const result = await withRetry(
-    () =>
-      generateText({
+    async () => {
+      if (useRawText) {
+        const r = await generateText({
+          model: openrouter(modelId),
+          system: SYSTEM_PROMPT,
+          prompt: buildUserPrompt(paragraph),
+          temperature: 0,
+          providerOptions: buildProviderOptions(reasoningEffort),
+          abortSignal: AbortSignal.timeout(360_000),
+        });
+        const text = r.text.trim();
+        const fenceMatch = text.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
+        const jsonStr = fenceMatch ? fenceMatch[1]! : text;
+        const parsed = LabelOutputRaw.parse(JSON.parse(jsonStr));
+        // usage, response and providerMetadata are getters, which object spread does not copy — re-attach them explicitly
+        return { ...r, output: parsed, usage: r.usage, response: r.response, providerMetadata: r.providerMetadata };
+      }
+      return generateText({
         model: openrouter(modelId),
         output: Output.object({ schema: LabelOutputRaw }),
         system: SYSTEM_PROMPT,
         prompt: buildUserPrompt(paragraph),
         temperature: 0,
         providerOptions: buildProviderOptions(reasoningEffort),
-        abortSignal: AbortSignal.timeout(120_000),
-      }),
+        abortSignal: AbortSignal.timeout(360_000),
+      });
+    },
     { label: `${modelId}:${paragraph.id}` },
   );
 
   const latencyMs = Date.now() - start;
-  const rawOutput = result.output;
+  const rawOutput = result.output as LabelOutputRaw;
   if (!rawOutput) throw new Error(`No output from ${modelId} for ${paragraph.id}`);
 
   return {
@@ -86,9 +106,9 @@ export async function annotateParagraph(
       stage,
       runId,
       promptVersion,
-      inputTokens: result.usage.inputTokens ?? 0,
-      outputTokens: result.usage.outputTokens ?? 0,
-      reasoningTokens: result.usage.outputTokenDetails?.reasoningTokens ?? 0,
+      inputTokens: result.usage?.inputTokens ?? 0,
+      outputTokens: result.usage?.outputTokens ?? 0,
+      reasoningTokens: result.usage?.outputTokenDetails?.reasoningTokens ?? 0,
       costUsd: extractCost(result),
       latencyMs,
       requestedAt,
@@ -147,9 +167,9 @@ export async function judgeParagraph(
       stage: "stage2-judge",
       runId,
       promptVersion,
-      inputTokens: result.usage.inputTokens ?? 0,
-      outputTokens: result.usage.outputTokens ?? 0,
-      reasoningTokens: result.usage.outputTokenDetails?.reasoningTokens ?? 0,
+      inputTokens: result.usage?.inputTokens ?? 0,
+      outputTokens: result.usage?.outputTokens ?? 0,
+      reasoningTokens: result.usage?.outputTokenDetails?.reasoningTokens ?? 0,
       costUsd: extractCost(result),
       latencyMs,
       requestedAt,
diff --git a/ts/src/label/prompts.ts b/ts/src/label/prompts.ts
index c1ec081..bd25199 100644
--- a/ts/src/label/prompts.ts
+++ b/ts/src/label/prompts.ts
@@ -1,6 +1,6 @@
 import type { Paragraph } from "@sec-cybert/schemas/paragraph.ts";
 
-export const PROMPT_VERSION = "v2.5";
+export const PROMPT_VERSION = "v3.0";
 
 /** System prompt for all Stage 1 annotation and benchmarking. */
 export const SYSTEM_PROMPT = `You are an expert annotator classifying paragraphs from SEC cybersecurity disclosures (Form 10-K Item 1C and Form 8-K Item 1.05 filings).
@@ -24,11 +24,18 @@ CATEGORY TIEBREAKERS:
   - Paragraph ONLY discusses financial cost, insurance, or materiality of an incident WITHOUT describing the event → Strategy Integration (even if it says "the incident" or "the cybersecurity incident")
   - Brief mention of a past incident + materiality conclusion as the main point → Strategy Integration
   - Standalone materiality conclusion with no incident reference → Strategy Integration
+  - Materiality disclaimers ("have not materially affected our business strategy, results of operations, or financial condition") → Strategy Integration, even if boilerplate. A cross-reference to Risk Factors appended to a materiality assessment does NOT change the classification. Only pure cross-references with no materiality conclusion are None/Other.
+  - SPACs and shell companies explicitly stating they have no operations, no cybersecurity program, or no formal processes → None/Other regardless of incidental mentions of board oversight or risk acknowledgment. The absence of a program is not a description of a program.
   - Internal processes mentioning vendors as one component → Risk Management Process
   - Requirements imposed ON vendors → Third-Party Risk
   - Board oversight mentioned briefly + management roles as main focus → Management Role
   - Management mentioned briefly + board oversight as main focus → Board Governance
 
+PERSON-VS-FUNCTION TEST (Management Role vs Risk Management Process):
+  If a paragraph is about the PERSON (qualifications, credentials, background, tenure, career history) → Management Role.
+  If it's about what the role/program DOES (processes, activities, tools, frameworks) → Risk Management Process, even if a CISO/CIO/CTO title appears.
+  Test: would the paragraph still make sense if you removed the person's name, title, and credentials? If yes → the paragraph is about the function, not the person → Risk Management Process.
+
 ═══ SPECIFICITY ═══
 
 "Generic Boilerplate" — Could paste into any company's filing unchanged. No named entities, frameworks, roles, dates, or specific details.
diff --git a/ts/src/lib/openrouter.ts b/ts/src/lib/openrouter.ts
index 3abe805..f3bf4b7 100644
--- a/ts/src/lib/openrouter.ts
+++ b/ts/src/lib/openrouter.ts
@@ -17,11 +17,11 @@ export const STAGE2_JUDGE = "anthropic/claude-sonnet-4.6" as const;
 export const BENCHMARK_MODELS = [
   ...STAGE1_MODELS,
   "openai/gpt-5.4",
-  "anthropic/claude-sonnet-4.6",
+  "moonshotai/kimi-k2.5",
   "google/gemini-3.1-pro-preview",
-  "zhipu/glm-5",
-  "minimax/minimax-m2.7",
-  "xiaomi/mimo-v2-pro",
+  "z-ai/glm-5:exacto",
+  "minimax/minimax-m2.7:exacto",
+  "xiaomi/mimo-v2-pro:exacto",
 ] as const;
 
 export type Stage1Model = (typeof STAGE1_MODELS)[number];