From 78d1f978de59bf21f5825a2b36de3bf0ec03dcfb Mon Sep 17 00:00:00 2001
From: Joey Eamigh <55670930+JoeyEamigh@users.noreply.github.com>
Date: Sat, 28 Mar 2026 20:39:36 -0400
Subject: [PATCH] initial scrape and tag
---
.gitignore | 20 +
docs/LABELING-CODEBOOK.md | 792 +++++++++++++++++++++++++++++
docs/NARRATIVE.md | 449 ++++++++++++++++
docs/PROJECT-OVERVIEW.md | 243 +++++++++
docs/TECHNICAL-GUIDE.md | 478 +++++++++++++++++
docs/implementation-plan.md | 345 +++++++++++++
python/.gitignore | 10 +
python/.python-version | 1 +
python/main.py | 6 +
python/pyproject.toml | 7 +
ts/.gitignore | 34 ++
ts/package.json | 25 +
ts/scripts/dispute-crosstab.ts | 501 ++++++++++++++++++
ts/scripts/judge-bench.ts | 455 +++++++++++++++++
ts/scripts/judge-diag-batch.ts | 114 +++++
ts/scripts/judge-diag.ts | 70 +++
ts/scripts/mimo-pilot.ts | 245 +++++++++
ts/scripts/mimo-raw-test.ts | 83 +++
ts/scripts/mimo-test.ts | 44 ++
ts/scripts/model-bench.ts | 259 ++++++++++
ts/scripts/model-bias-analysis.ts | 470 +++++++++++++++++
ts/scripts/model-probe.ts | 79 +++
ts/scripts/pilot.ts | 476 +++++++++++++++++
ts/scripts/sample-disputes.ts | 229 +++++++++
ts/scripts/segment-analysis.ts | 432 ++++++++++++++++
ts/scripts/stage1-analyze.ts | 538 ++++++++++++++++++++
ts/scripts/stage1-run.ts | 158 ++++++
ts/src/analyze/corpus-stats.ts | 617 ++++++++++++++++++++++
ts/src/analyze/data-quality.ts | 450 ++++++++++++++++
ts/src/analyze/debug-parser.ts | 71 +++
ts/src/analyze/dedup-analysis.ts | 221 ++++++++
ts/src/analyze/diagnose-skips.ts | 152 ++++++
ts/src/cli.ts | 334 ++++++++++++
ts/src/extract/bulk-submissions.ts | 95 ++++
ts/src/extract/edgar-client.ts | 336 ++++++++++++
ts/src/extract/fast-reparse.ts | 221 ++++++++
ts/src/extract/parallel-reparse.ts | 96 ++++
ts/src/extract/parse-item1c.ts | 309 +++++++++++
ts/src/extract/pipeline.ts | 607 ++++++++++++++++++++++
ts/src/extract/reparse-worker.ts | 342 +++++++++++++
ts/src/extract/scan-8k-items.py | 98 ++++
ts/src/extract/segment.ts | 208 ++++++++
ts/src/label/annotate.ts | 158 ++++++
ts/src/label/batch.ts | 259 ++++++++++
ts/src/label/consensus.ts | 130 +++++
ts/src/label/prompts.ts | 292 +++++++++++
ts/src/lib/checkpoint.ts | 25 +
ts/src/lib/jsonl.ts | 80 +++
ts/src/lib/openrouter.ts | 33 ++
ts/src/lib/retry.ts | 70 +++
ts/src/schemas/annotation.ts | 27 +
ts/src/schemas/benchmark.ts | 28 +
ts/src/schemas/consensus.ts | 22 +
ts/src/schemas/gold.ts | 23 +
ts/src/schemas/index.ts | 18 +
ts/src/schemas/label.ts | 148 ++++++
ts/src/schemas/paragraph.ts | 25 +
ts/src/schemas/session.ts | 30 ++
ts/tsconfig.json | 30 ++
59 files changed, 12118 insertions(+)
create mode 100644 .gitignore
create mode 100644 docs/LABELING-CODEBOOK.md
create mode 100644 docs/NARRATIVE.md
create mode 100644 docs/PROJECT-OVERVIEW.md
create mode 100644 docs/TECHNICAL-GUIDE.md
create mode 100644 docs/implementation-plan.md
create mode 100644 python/.gitignore
create mode 100644 python/.python-version
create mode 100644 python/main.py
create mode 100644 python/pyproject.toml
create mode 100644 ts/.gitignore
create mode 100644 ts/package.json
create mode 100644 ts/scripts/dispute-crosstab.ts
create mode 100644 ts/scripts/judge-bench.ts
create mode 100644 ts/scripts/judge-diag-batch.ts
create mode 100644 ts/scripts/judge-diag.ts
create mode 100644 ts/scripts/mimo-pilot.ts
create mode 100644 ts/scripts/mimo-raw-test.ts
create mode 100644 ts/scripts/mimo-test.ts
create mode 100644 ts/scripts/model-bench.ts
create mode 100644 ts/scripts/model-bias-analysis.ts
create mode 100644 ts/scripts/model-probe.ts
create mode 100644 ts/scripts/pilot.ts
create mode 100644 ts/scripts/sample-disputes.ts
create mode 100644 ts/scripts/segment-analysis.ts
create mode 100644 ts/scripts/stage1-analyze.ts
create mode 100644 ts/scripts/stage1-run.ts
create mode 100644 ts/src/analyze/corpus-stats.ts
create mode 100644 ts/src/analyze/data-quality.ts
create mode 100644 ts/src/analyze/debug-parser.ts
create mode 100644 ts/src/analyze/dedup-analysis.ts
create mode 100644 ts/src/analyze/diagnose-skips.ts
create mode 100644 ts/src/cli.ts
create mode 100644 ts/src/extract/bulk-submissions.ts
create mode 100644 ts/src/extract/edgar-client.ts
create mode 100644 ts/src/extract/fast-reparse.ts
create mode 100644 ts/src/extract/parallel-reparse.ts
create mode 100644 ts/src/extract/parse-item1c.ts
create mode 100644 ts/src/extract/pipeline.ts
create mode 100644 ts/src/extract/reparse-worker.ts
create mode 100644 ts/src/extract/scan-8k-items.py
create mode 100644 ts/src/extract/segment.ts
create mode 100644 ts/src/label/annotate.ts
create mode 100644 ts/src/label/batch.ts
create mode 100644 ts/src/label/consensus.ts
create mode 100644 ts/src/label/prompts.ts
create mode 100644 ts/src/lib/checkpoint.ts
create mode 100644 ts/src/lib/jsonl.ts
create mode 100644 ts/src/lib/openrouter.ts
create mode 100644 ts/src/lib/retry.ts
create mode 100644 ts/src/schemas/annotation.ts
create mode 100644 ts/src/schemas/benchmark.ts
create mode 100644 ts/src/schemas/consensus.ts
create mode 100644 ts/src/schemas/gold.ts
create mode 100644 ts/src/schemas/index.ts
create mode 100644 ts/src/schemas/label.ts
create mode 100644 ts/src/schemas/paragraph.ts
create mode 100644 ts/src/schemas/session.ts
create mode 100644 ts/tsconfig.json
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..45ba100
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,20 @@
+# Data (too large for git)
+data/
+models/
+
+# Dependencies
+ts/node_modules/
+ts/bun.lock
+
+# Python
+python/.venv/
+python/uv.lock
+__pycache__/
+*.pyc
+
+# Editor
+.vscode/
+.idea/
+
+# OS
+.DS_Store
diff --git a/docs/LABELING-CODEBOOK.md b/docs/LABELING-CODEBOOK.md
new file mode 100644
index 0000000..622404b
--- /dev/null
+++ b/docs/LABELING-CODEBOOK.md
@@ -0,0 +1,792 @@
+# Labeling Codebook — SEC Cybersecurity Disclosure Quality
+
+This codebook is the authoritative reference for all human and GenAI labeling. Every annotator (human or model) must follow these definitions exactly. The LLM system prompt is generated directly from this document.
+
+---
+
+## Classification Design
+
+**Unit of analysis:** One paragraph from an SEC filing (Item 1C of 10-K, or Item 1.05/8.01/7.01 of 8-K).
+
+**Classification type:** Multi-class (single-label), NOT multi-label. Each paragraph receives exactly one content category.
+
+**Each paragraph receives two labels:**
+1. **Content Category** — single-label, one of 7 mutually exclusive classes
+2. **Specificity Level** — ordinal integer 1-4
+
+**None/Other policy:** Required. Since this is multi-class (not multi-label), we need a catch-all for paragraphs that don't fit the 6 substantive categories. A paragraph receives None/Other when it contains no cybersecurity-specific disclosure content (e.g., forward-looking statement disclaimers, section headers, general business language).
+
+---
+
+## Dimension 1: Content Category
+
+Each paragraph is assigned exactly **one** content category. If a paragraph spans multiple categories, assign the **dominant** category — the one that best describes the paragraph's primary communicative purpose.
+
+### Board Governance
+
+- **SEC basis:** Item 106(c)(1)
+- **Covers:** Board or committee oversight of cybersecurity risks, briefing frequency, board member cybersecurity expertise
+- **Key markers:** "Audit Committee," "Board of Directors oversees," "quarterly briefings," "board-level expertise," "board committee"
+- **Assign when:** The grammatical subject performing the primary action is the board or a board committee
+
+**Example texts:**
+
+> *"The Board of Directors oversees the Company's management of cybersecurity risks. The Board has delegated oversight of cybersecurity and data privacy matters to the Audit Committee."*
+> → Board Governance, Specificity 3 (names Audit Committee — firm-specific delegation)
+
+> *"Our Board of Directors recognizes the critical importance of maintaining the trust and confidence of our customers and stakeholders, and cybersecurity risk is an area of increasing focus for our Board."*
+> → Board Governance, Specificity 1 (could apply to any company — generic statement of intent)
+
+> *"The Audit Committee, which includes two members with significant technology and cybersecurity expertise, receives quarterly reports from the CISO and conducts an annual deep-dive review of the Company's cybersecurity program, threat landscape, and incident response readiness."*
+> → Board Governance, Specificity 3 (names specific committee, describes specific briefing cadence and scope)
+
+### Management Role
+
+- **SEC basis:** Item 106(c)(2)
+- **Covers:** The specific *person* filling a cybersecurity leadership position: their name, qualifications, career history, credentials, tenure, reporting lines, management committees responsible for cybersecurity
+- **Key markers:** "Chief Information Security Officer," "reports to," "years of experience," "management committee," "CISSP," "CISM," named individuals, career background
+- **Assign when:** The paragraph tells you something about *who the person is* — their background, credentials, experience, or reporting structure. A paragraph that names a CISO/CIO/CTO and then describes what the cybersecurity *program* does is NOT Management Role — it is Risk Management Process with an incidental role attribution. The test is whether the paragraph is about the **person** or about the **function**.
+
+**The person-vs-function test:** If you removed the role holder's name, title, qualifications, and background from the paragraph and the remaining content still describes substantive cybersecurity activities, processes, or oversight → the paragraph is about the function (Risk Management Process), not the person (Management Role). Management Role requires the person's identity or credentials to be the primary content, not just a brief attribution of who runs the program.
+
+**Example texts:**
+
+> *"Our Vice President of Information Security, who holds CISSP and CISM certifications and has over 20 years of experience in cybersecurity, reports directly to our Chief Information Officer and is responsible for leading our cybersecurity program."*
+> → Management Role, Specificity 3 — The paragraph is about the person: their credentials, experience, and reporting line. (named role, certifications, reporting line — all firm-specific)
+
+> *"Management is responsible for assessing and managing cybersecurity risks within the organization."*
+> → Management Role, Specificity 1 (generic, no named roles or structure)
+
+> *"Our CISO, Sarah Chen, leads a dedicated cybersecurity team of 35 professionals and presents monthly threat briefings to the executive leadership team. Ms. Chen joined the Company in 2019 after serving as Deputy CISO at a Fortune 100 financial services firm."*
+> → Management Role, Specificity 4 — The paragraph is about the person: their name, team size, background, prior role. (named individual, team size, specific frequency, prior employer — multiple verifiable facts)
+
+> *"Our CISO oversees the Company's cybersecurity program, which includes risk assessments, vulnerability scanning, penetration testing, and incident response planning aligned with the NIST CSF framework."*
+> → **Risk Management Process**, NOT Management Role — The CISO is mentioned once as attribution, but the paragraph is about what the program does. Remove "Our CISO oversees" and the paragraph still makes complete sense as a process description.
+
+### Risk Management Process
+
+- **SEC basis:** Item 106(b)
+- **Covers:** Risk assessment methodology, framework adoption (NIST, ISO, etc.), vulnerability management, monitoring, incident response planning, tabletop exercises, ERM integration
+- **Key markers:** "NIST CSF," "ISO 27001," "risk assessment," "vulnerability management," "tabletop exercises," "incident response plan," "SOC," "SIEM"
+- **Assign when:** The paragraph primarily describes the company's internal cybersecurity processes, tools, or methodologies
+
+**Example texts:**
+
+> *"We maintain a cybersecurity risk management program that is integrated into our overall enterprise risk management framework. Our program is designed to identify, assess, and manage material cybersecurity risks to our business."*
+> → Risk Management Process, Specificity 1 (generic, could apply to any company)
+
+> *"Our cybersecurity program is aligned with the NIST Cybersecurity Framework and incorporates elements of ISO 27001. We conduct regular risk assessments, vulnerability scanning, and penetration testing as part of our continuous monitoring approach."*
+> → Risk Management Process, Specificity 2 (names frameworks but no firm-specific detail)
+
+> *"We operate a 24/7 Security Operations Center that uses Splunk SIEM and CrowdStrike Falcon endpoint detection. Our incident response team conducts quarterly tabletop exercises simulating ransomware, supply chain compromise, and insider threat scenarios."*
+> → Risk Management Process, Specificity 4 (named tools, named vendor, specific exercise frequency and scenarios — verifiable)
+
+### Third-Party Risk
+
+- **SEC basis:** Item 106(b)
+- **Covers:** Vendor/supplier risk oversight, external assessor engagement, contractual security requirements, supply chain risk management
+- **Key markers:** "third-party," "service providers," "vendor risk," "external auditors," "supply chain," "SOC 2 report," "contractual requirements"
+- **Assign when:** The central topic is oversight of external parties' cybersecurity, not the company's own internal processes
+
+**Example texts:**
+
+> *"We face cybersecurity risks associated with our use of third-party service providers who may have access to our systems and data."*
+> → Third-Party Risk, Specificity 1 (generic risk statement)
+
+> *"Our vendor risk management program requires all third-party service providers with access to sensitive data to meet minimum security standards, including SOC 2 Type II certification or equivalent third-party attestation."*
+> → Third-Party Risk, Specificity 2 (names SOC 2 standard but no firm-specific detail about which vendors or how many)
+
+> *"We assessed 312 vendors in fiscal 2024 through our Third-Party Risk Management program. All Tier 1 vendors (those with access to customer PII or financial data) are required to provide annual SOC 2 Type II reports. In fiscal 2024, 14 vendors were placed on remediation plans and 3 vendor relationships were terminated for non-compliance."*
+> → Third-Party Risk, Specificity 4 (specific numbers, specific actions, specific criteria — all verifiable)
+
+### Incident Disclosure
+
+- **SEC basis:** 8-K Item 1.05 (and 8.01/7.01 post-May 2024)
+- **Covers:** Description of cybersecurity incidents — nature, scope, timing, impact assessment, remediation actions, ongoing investigation
+- **Key markers:** "unauthorized access," "detected," "incident," "remediation," "impacted," "forensic investigation," "breach," "compromised"
+- **Assign when:** The paragraph primarily describes what happened in a cybersecurity incident
+
+**Example texts:**
+
+> *"We have experienced, and may in the future experience, cybersecurity incidents that could have a material adverse effect on our business, results of operations, and financial condition."*
+> → Incident Disclosure, Specificity 1 (hypothetical framing — no actual incident described. Note: if this appears in Item 1C rather than an 8-K, classify it as Risk Management Process or Strategy Integration per Borderline Case 4, since generic risk language is not a description of an event that actually occurred)
+
+> *"On January 15, 2024, we detected unauthorized access to our customer support portal. The threat actor exploited a known vulnerability in a third-party software component. Upon detection, we activated our incident response plan, contained the intrusion, and engaged Mandiant for forensic investigation."*
+> → Incident Disclosure, Specificity 4 (specific date, specific system, named forensic firm, specific attack vector — all verifiable)
+
+> *"In December 2023, the Company experienced a cybersecurity incident involving unauthorized access to certain internal systems. The Company promptly took steps to contain and remediate the incident, including engaging third-party cybersecurity experts."*
+> → Incident Disclosure, Specificity 3 (specific month, specific action — but no named firms or quantified impact)
+
+### Strategy Integration
+
+- **SEC basis:** Item 106(b)(2)
+- **Covers:** Material impact (or lack thereof) on business strategy or financials, cybersecurity insurance, investment/resource allocation, cost of incidents
+- **Key markers:** "business strategy," "insurance," "investment," "material," "financial condition," "budget," "not materially affected," "results of operations"
+- **Assign when:** The paragraph primarily discusses business/financial consequences or strategic response to cyber risk, not the risk management activities themselves
+- **Includes materiality disclaimers:** Any paragraph that explicitly assesses whether cybersecurity risks have or could "materially affect" the company's business, strategy, financial condition, or results of operations is Strategy Integration — even if the assessment is boilerplate. The company is making a strategic judgment about cyber risk impact, which is the essence of this category. A cross-reference to Risk Factors appended to a materiality assessment does not change the classification.
+
+**Example texts:**
+
+> *"Cybersecurity risks, including those described above, have not materially affected, and are not reasonably likely to materially affect, our business strategy, results of operations, or financial condition."*
+> → Strategy Integration, Specificity 1 (boilerplate materiality statement — nearly identical language appears across thousands of filings, but it IS a materiality assessment)
+
+> *"We have not identified any cybersecurity incidents or threats that have materially affected us. For more information, see Item 1A, Risk Factors."*
+> → Strategy Integration, Specificity 1 — The materiality assessment is the substantive content. The cross-reference is noise and does not pull the paragraph to None/Other.
+
+> *"We maintain cybersecurity insurance coverage as part of our overall risk management strategy to help mitigate potential financial losses from cybersecurity incidents."*
+> → Strategy Integration, Specificity 2 (mentions insurance but no specifics)
+
+> *"We increased our cybersecurity budget by 32% to $45M in fiscal 2024, representing 0.8% of revenue. We maintain cyber liability insurance with $100M in aggregate coverage through AIG and Chubb, with a $5M deductible per incident."*
+> → Strategy Integration, Specificity 4 (dollar amounts, percentages, named insurers, specific deductible — all verifiable)
+
+### None/Other
+
+- **Covers:** Forward-looking statement disclaimers, section headers, cross-references to other filing sections, general business language that mentions cybersecurity incidentally, text erroneously extracted from outside Item 1C/1.05
+- **No specificity scoring needed:** Always assign Specificity 1 for None/Other paragraphs (since there is no cybersecurity disclosure to rate)
+- **SPACs and shell companies:** Companies that explicitly state they have no operations, no cybersecurity program, or no formal processes receive None/Other regardless of incidental mentions of board oversight or risk acknowledgment. The absence of a program is not a description of a program. Paragraphs like "We have not adopted any cybersecurity risk management program. Our board is generally responsible for oversight" are None/Other — the board mention is perfunctory, not substantive governance disclosure.
+- **Distinguishing from Strategy Integration:** A pure cross-reference ("See Item 1A, Risk Factors") with no materiality assessment is None/Other. But if the paragraph includes an explicit materiality conclusion ("have not materially affected our business strategy"), it becomes Strategy Integration even if a cross-reference is also present. The test: does the paragraph make a substantive claim about cybersecurity's impact on the business? If yes → Strategy Integration. If it only points elsewhere → None/Other.
+
+**Example texts:**
+
+> *"This Annual Report on Form 10-K contains forward-looking statements within the meaning of Section 27A of the Securities Act of 1933, as amended, and Section 21E of the Securities Exchange Act of 1934, as amended."*
+> → None/Other, Specificity 1
+
+> *"Item 1C. Cybersecurity"*
+> → None/Other, Specificity 1 (section header only)
+
+> *"For additional information about risks related to our information technology systems, see Part I, Item 1A, 'Risk Factors.'"*
+> → None/Other, Specificity 1 (cross-reference, no disclosure content)
+
+> *"We are a special purpose acquisition company with no business operations. We have not adopted any cybersecurity risk management program. Our board of directors is generally responsible for oversight of cybersecurity risks, if any."*
+> → None/Other, Specificity 1 — No substantive disclosure. The board mention is incidental; the company explicitly has no program to disclose.
+
+> *"We do not consider that we face significant cybersecurity risk and have not adopted any formal processes for assessing cybersecurity risk."*
+> → None/Other, Specificity 1 — Absence of a program is not a program description.
+
+---
+
+## Category Decision Rules
+
+### Rule 1: Dominant Category
+If a paragraph spans multiple categories, assign the one whose topic occupies the most text or is the paragraph's primary communicative purpose.
+
+### Rule 2: Board vs. Management
+| Signal | Category |
+|--------|----------|
+| Board/committee is the grammatical subject | Board Governance |
+| Board delegates responsibility to management | Board Governance |
+| Management role reports TO the board | Management Role |
+| Management role's qualifications are described | Management Role |
+| "Board oversees... CISO reports to Board quarterly" | Board Governance (board is primary actor) |
+| "CISO reports quarterly to the Board on..." | Management Role (CISO is primary actor) |
+
+### Rule 2b: Management Role vs. Risk Management Process (the person-vs-function test)
+
+This is the single most common source of annotator disagreement. The line is: **is the paragraph about the person or about the function?**
+
+| Signal | Category |
+|--------|----------|
+| The person's background, credentials, tenure, experience, education, career history | Management Role |
+| The person's name is given | Management Role (strong signal) |
+| Reporting lines as primary content (who reports to whom, management committee structure) | Management Role |
+| Role title mentioned as attribution ("Our CISO oversees...") followed by process description | **Risk Management Process** |
+| Activities, tools, methodologies, frameworks as the primary content | **Risk Management Process** |
+| The paragraph would still make sense if you removed the role title and replaced it with "the Company" | **Risk Management Process** |
+
+**Key principle:** Naming a cybersecurity leadership title (CISO, CIO, CTO, VP of Security) does not make a paragraph Management Role. The title is often an incidental attribution — the paragraph names who is responsible then describes what the program does. If the paragraph's substantive content is about processes, activities, or tools, it is Risk Management Process regardless of how many times a role title appears. Management Role requires the paragraph's content to be about the *person* — who they are, what makes them qualified, how long they've served, what their background is.
+
+### Rule 3: Risk Management vs. Third-Party
+| Signal | Category |
+|--------|----------|
+| Company's own internal processes, tools, teams | Risk Management Process |
+| Third parties mentioned as ONE component of internal program | Risk Management Process |
+| Vendor oversight is the CENTRAL topic | Third-Party Risk |
+| External assessor hired to test the company | Risk Management Process (they serve the company) |
+| Requirements imposed ON vendors | Third-Party Risk |
+
+### Rule 4: Incident vs. Strategy
+| Signal | Category |
+|--------|----------|
+| Describes what happened (timeline, scope, response) | Incident Disclosure |
+| Describes business impact of an incident (costs, revenue, insurance claim) | Strategy Integration |
+| Mixed: "We detected X... at a cost of $Y" | Assign based on which is dominant — if cost is one sentence in a paragraph about the incident → Incident Disclosure |
+
+### Rule 5: None/Other Threshold
+Assign None/Other ONLY when the paragraph contains no substantive cybersecurity disclosure content. If a paragraph mentions cybersecurity even briefly in service of a disclosure obligation, assign the relevant content category.
+
+**Exception — SPACs and no-operations companies:** A paragraph that explicitly states the company has no cybersecurity program, no operations, or no formal processes is None/Other even if it perfunctorily mentions board oversight or risk acknowledgment. The absence of a program is not substantive disclosure.
+
+### Rule 6: Materiality Disclaimers → Strategy Integration
+Any paragraph that explicitly assesses whether cybersecurity risks or incidents have "materially affected" (or are "reasonably likely to materially affect") the company's business strategy, results of operations, or financial condition is **Strategy Integration** — even when the assessment is boilerplate. The materiality assessment is the substantive content. A cross-reference to Risk Factors appended to a materiality assessment does not change the classification to None/Other. Only a *pure* cross-reference with no materiality conclusion is None/Other.
+
+---
+
+## Borderline Cases
+
+### Case 1: Framework mention + firm-specific fact
+> *"We follow NIST CSF and our CISO oversees the program."*
+
+The NIST mention → Level 2 anchor. The CISO reference → firm-specific. **Apply boundary rule 2→3: "Does it mention anything unique to THIS company?" Yes (CISO role exists at this company) → Level 3.**
+
+### Case 2: Named role but generic description
+> *"Our Chief Information Security Officer is responsible for managing cybersecurity risks."*
+
+Names a role (CISO) → potentially Level 3. But the description is completely generic. **Apply judgment: the mere existence of a CISO title is firm-specific (not all companies have one). → Level 3.** If the paragraph said "a senior executive is responsible" without naming the role → Level 1.
+
+### Case 3: Specificity-rich None/Other
+> *"On March 15, 2025, we filed a Current Report on Form 8-K disclosing a cybersecurity incident. For details, see our Form 8-K filed March 15, 2025, accession number 0001193125-25-012345."*
+
+Contains specific dates and filing numbers, but the paragraph itself contains no disclosure content — it's a cross-reference. → **None/Other, Specificity 1.** Specificity only applies to disclosure substance, not to metadata.
+
+### Case 4: Hypothetical incident language in 10-K
+> *"We may experience cybersecurity incidents that could disrupt our operations."*
+
+This appears in Item 1C, not an 8-K. It describes no actual incident. → **Risk Management Process or Strategy Integration (depending on context), NOT Incident Disclosure.** Incident Disclosure is reserved for descriptions of events that actually occurred.
+
+### Case 5: Dual-category paragraph
+> *"The Audit Committee oversees our cybersecurity program, which is led by our CISO who holds CISSP certification and reports quarterly to the Committee."*
+
+Board (Audit Committee oversees) + Management (CISO qualifications, reporting). The opening clause sets the frame: this is about the Audit Committee's oversight, and the CISO detail is subordinate. → **Board Governance, Specificity 3.**
+
+### Case 6: Management Role vs. Risk Management Process — the person-vs-function test
+> *"Our CISO oversees the Company's cybersecurity program, which includes risk assessments, vulnerability scanning, and incident response planning. The program is aligned with the NIST CSF framework and integrated into our enterprise risk management process."*
+
+The CISO is named as attribution, but the paragraph is about what the program does — assessments, scanning, response planning, framework alignment, ERM integration. Remove "Our CISO oversees" and it still makes complete sense as a process description. → **Risk Management Process, Specificity 2** (NIST CSF framework, no firm-specific facts beyond that).
+
+> *"Our CISO has over 20 years of experience in cybersecurity and holds CISSP and CISM certifications. She reports directly to the CIO and oversees a team of 12 security professionals. Prior to joining the Company in 2019, she served as VP of Security at a Fortune 500 technology firm."*
+
+The entire paragraph is about the person: experience, certifications, reporting line, team size, tenure, prior role. → **Management Role, Specificity 4** (years of experience + team headcount + named certifications = multiple QV-eligible facts).
+
+### Case 7: Materiality disclaimer — Strategy Integration vs. None/Other
+> *"We have not identified any cybersecurity incidents or threats that have materially affected our business strategy, results of operations, or financial condition. However, like other companies, we have experienced threats from time to time. For more information, see Item 1A, Risk Factors."*
+
+Contains an explicit materiality assessment ("materially affected... business strategy, results of operations, or financial condition"). The cross-reference and generic threat mention are noise. → **Strategy Integration, Specificity 1.**
+
+> *"For additional information about risks related to our information technology systems, see Part I, Item 1A, 'Risk Factors.'"*
+
+No materiality assessment. Pure cross-reference. → **None/Other, Specificity 1.**
+
+### Case 8: SPAC / no-operations company
+> *"We are a special purpose acquisition company with no business operations. We have not adopted any cybersecurity risk management program or formal processes. Our Board of Directors is generally responsible for oversight of cybersecurity risks, if any. We have not encountered any cybersecurity incidents since our IPO."*
+
+Despite touching RMP (no program), Board Governance (board is responsible), and Strategy Integration (no incidents), the paragraph contains no substantive disclosure. The company explicitly has no program, and the board mention is perfunctory ("generally responsible... if any"). The absence of a program is not a program description. → **None/Other, Specificity 1.**
+
+---
+
+## Dimension 2: Specificity Level
+
+Each paragraph receives a specificity level (1-4) indicating how company-specific the disclosure is. Apply the decision test in order — stop at the first "yes."
+
+### Decision Test
+
+1. **Count hard verifiable facts ONLY** (specific dates, dollar amounts, headcounts/percentages, named third-party firms, named products/tools, named certifications). TWO or more? → **Quantified-Verifiable (4)**
+2. **Does it contain at least one fact from the IS list below?** → **Firm-Specific (3)**
+3. **Does it name a recognized standard** (NIST, ISO 27001, SOC 2, CIS, GDPR, PCI DSS, HIPAA)? → **Sector-Adapted (2)**
+4. **None of the above?** → **Generic Boilerplate (1)**
+
+None/Other paragraphs always receive Specificity 1.
+
+### Level Definitions
+
+| Level | Name | Description |
+|-------|------|-------------|
+| 1 | Generic Boilerplate | Could paste into any company's filing unchanged. No named entities, frameworks, roles, dates, or specific details. |
+| 2 | Sector-Adapted | Names a specific recognized standard (NIST, ISO 27001, SOC 2, etc.) but contains nothing unique to THIS company. General practices (pen testing, vulnerability scanning, tabletop exercises) do NOT qualify — only named standards. |
+| 3 | Firm-Specific | Contains at least one fact from the IS list that identifies something unique to THIS company's disclosure. |
+| 4 | Quantified-Verifiable | Contains TWO or more hard verifiable facts (see QV-eligible list). One fact = Firm-Specific, not QV. |
+
+### ✓ IS a Specific Fact (any ONE → at least Firm-Specific)
+
+- **Cybersecurity-specific titles:** CISO, CTO, CIO, VP of IT/Security, Information Security Officer, Director of IT Security, HSE Director overseeing cybersecurity, Chief Digital Officer (when overseeing cyber), Cybersecurity Director
+- **Named non-generic committees:** Technology Committee, Cybersecurity Committee, Risk Committee, ERM Committee (NOT "Audit Committee" — that exists at every public company)
+- **Specific team/department compositions:** "Legal, Compliance, and Finance" (but NOT just "a cross-functional team")
+- **Specific dates:** "In December 2023", "On May 6, 2024", "fiscal 2025"
+- **Named internal programs with unique identifiers:** "Cyber Incident Response Plan (CIRP)" (must have a distinguishing name/abbreviation — generic "incident response plan" does not qualify)
+- **Named products, systems, tools:** Splunk, CrowdStrike Falcon, Azure Sentinel, ServiceNow
+- **Named third-party firms:** Mandiant, Deloitte, CrowdStrike, PwC
+- **Specific numbers:** headcounts, dollar amounts, percentages, exact durations ("17 years", "12 professionals")
+- **Certification claims:** "We maintain ISO 27001 certification" (holding a certification is more than naming a standard)
+- **Named universities in credential context:** "Ph.D. from Princeton University" (independently verifiable)
+
+### ✗ IS NOT a Specific Fact (do NOT use to justify Firm-Specific)
+
+- **Generic governance:** "the Board", "Board of Directors", "management", "Audit Committee", "the Committee"
+- **Generic C-suite:** CEO, CFO, COO, President, General Counsel — these exist at every company and are not cybersecurity-specific
+- **Generic IT leadership (NOT cybersecurity-specific):** "Head of IT", "IT Manager", "Director of IT", "Chief Compliance Officer", "Associate Vice President of IT" — these are general corporate/IT titles, not cybersecurity roles per the IS list
+- **Unnamed entities:** "third-party experts", "external consultants", "cybersecurity firms", "managed service provider"
+- **Generic cadences:** "quarterly", "annual", "periodic", "regular" — without exact dates
+- **Boilerplate phrases:** "cybersecurity risks", "material adverse effect", "business operations", "financial condition"
+- **Standard incident language:** "forensic investigation", "law enforcement", "regulatory obligations", "incident response protocols"
+- **Vague quantifiers:** "certain systems", "some employees", "a number of", "a portion of"
+- **Common practices:** "penetration testing", "vulnerability scanning", "tabletop exercises", "phishing simulations", "security awareness training"
+- **Generic program names:** "incident response plan", "business continuity plan", "cybersecurity program", "Third-Party Risk Management Program", "Company-wide training" — no unique identifier or distinguishing abbreviation
+- **Company self-references:** the company's own name, "the Company", "the Bank", subsidiary names, filing form types
+- **Company milestones:** "since our IPO", "since inception" — not cybersecurity facts
+
+### QV-Eligible Facts (count toward the 2-fact threshold for Quantified-Verifiable)
+
+✓ Specific dates (month+year or exact date)
+✓ Dollar amounts, headcounts, percentages
+✓ Named third-party firms (Mandiant, CrowdStrike, Deloitte)
+✓ Named products/tools (Splunk, Azure Sentinel)
+✓ Named certifications held by individuals (CISSP, CISM, CEH)
+✓ Years of experience as a specific number ("17 years", "over 20 years")
+✓ Named universities in credential context
+
+**Do NOT count toward QV** (these trigger Firm-Specific but not QV):
+✗ Named roles (CISO, CIO)
+✗ Named committees
+✗ Named frameworks (NIST, ISO 27001) — these trigger Sector-Adapted
+✗ Team compositions, reporting structures
+✗ Named internal programs
+✗ Generic degrees without named university ("BS in Management")
+
+### Validation Step
+
+Before finalizing specificity, review the extracted facts. Remove any that appear on the NOT list. If no facts remain after filtering → Generic Boilerplate (or Sector-Adapted if a named standard is present). Do not let NOT-list items inflate the specificity rating.
+
+---
+
+## LLM Response Schema
+
+The exact Zod schema passed to `generateObject`. This is the contract between the LLM and our pipeline.
+
+```typescript
+import { z } from "zod";
+
+export const ContentCategory = z.enum([
+ "Board Governance",
+ "Management Role",
+ "Risk Management Process",
+ "Third-Party Risk",
+ "Incident Disclosure",
+ "Strategy Integration",
+ "None/Other",
+]);
+
+export const SpecificityLevel = z.union([
+ z.literal(1),
+ z.literal(2),
+ z.literal(3),
+ z.literal(4),
+]);
+
+export const Confidence = z.enum(["high", "medium", "low"]);
+
+export const LabelOutput = z.object({
+ content_category: ContentCategory
+ .describe("The single most applicable content category for this paragraph"),
+ specificity_level: SpecificityLevel
+ .describe("1=generic boilerplate, 2=sector-adapted, 3=firm-specific, 4=quantified-verifiable"),
+ category_confidence: Confidence
+ .describe("high=clear-cut, medium=some ambiguity, low=genuinely torn between categories"),
+ specificity_confidence: Confidence
+ .describe("high=clear-cut, medium=borderline adjacent levels, low=could argue for 2+ levels"),
+ reasoning: z.string()
+ .describe("Brief 1-2 sentence justification citing specific evidence from the text"),
+});
+```
+
+**Output example:**
+```json
+{
+ "content_category": "Risk Management Process",
+ "specificity_level": 3,
+ "category_confidence": "high",
+ "specificity_confidence": "medium",
+ "reasoning": "Names NIST CSF (sector-adapted) and describes quarterly tabletop exercises specific to this company's program, pushing to firm-specific. Specificity borderline 2/3 — tabletop exercises could be generic or firm-specific depending on interpretation."
+}
+```
+
+---
+
+## System Prompt
+
+> **Note:** The system prompt below is the v1.0 template from the initial codebook. The production Stage 1 prompt is **v2.5** (in `ts/src/label/prompts.ts`), which incorporates the IS/NOT lists, calibration examples, validation step, and decision test from this codebook. The Stage 2 judge prompt (`buildJudgePrompt()` in the same file) adds dynamic disambiguation rules and confidence calibration. **This codebook is the source of truth; the prompt mirrors it.**
+
+The v1.0 template is preserved below for reference. See `ts/src/label/prompts.ts` for the current production prompt.
+
+```
+You are an expert annotator classifying paragraphs from SEC cybersecurity disclosures (Form 10-K Item 1C and Form 8-K Item 1.05 filings) under SEC Release 33-11216.
+
+For each paragraph, assign exactly two labels:
+
+(a) content_category — the single most applicable category:
+ - "Board Governance": Board/committee oversight of cyber risk, briefing cadence, board member cyber expertise. SEC basis: Item 106(c)(1).
+ - "Management Role": CISO/CTO/CIO identification, qualifications, reporting lines, management committees. SEC basis: Item 106(c)(2).
+ - "Risk Management Process": Risk assessment methods, framework adoption (NIST, ISO), vulnerability management, monitoring, incident response planning, tabletop exercises, ERM integration. SEC basis: Item 106(b).
+ - "Third-Party Risk": Vendor/supplier security oversight, external assessor requirements, contractual security standards, supply chain risk. SEC basis: Item 106(b).
+ - "Incident Disclosure": Description of actual cybersecurity incidents — nature, scope, timing, impact, remediation. SEC basis: 8-K Item 1.05.
+ - "Strategy Integration": Material impact on business strategy/financials, cyber insurance, investment/resource allocation. SEC basis: Item 106(b)(2).
+ - "None/Other": Forward-looking disclaimers, section headers, cross-references, non-cybersecurity content.
+
+If a paragraph spans multiple categories, assign the DOMINANT one — the category that best describes the paragraph's primary communicative purpose.
+
+(b) specificity_level — integer 1 through 4:
+ 1 = Generic Boilerplate: Could apply to any company unchanged. Conditional language ("may," "could"). No named entities or frameworks.
+ 2 = Sector-Adapted: Names frameworks/standards (NIST, ISO, SOC 2) or industry-specific terms, but nothing unique to THIS company.
+ 3 = Firm-Specific: Contains at least one fact unique to this company — named roles, specific committees, concrete reporting lines, named programs.
+ 4 = Quantified-Verifiable: Two or more verifiable facts — dollar amounts, dates, headcounts, percentages, named third-party firms, audit results.
+
+BOUNDARY RULES (apply when torn between adjacent levels):
+ 1 vs 2: "Does it name ANY framework, standard, or industry-specific term?" → Yes = 2
+ 2 vs 3: "Does it mention anything unique to THIS company?" → Yes = 3
+ 3 vs 4: "Does it contain TWO OR MORE independently verifiable facts?" → Yes = 4
+
+SPECIAL RULES:
+ - None/Other paragraphs always get specificity_level = 1.
+ - Hypothetical incident language ("we may experience...") in a 10-K is NOT Incident Disclosure. It is Risk Management Process or Strategy Integration.
+ - Incident Disclosure is only for descriptions of events that actually occurred.
+
+CONFIDENCE RATINGS (per dimension):
+ - "high": Clear-cut classification with no reasonable alternative.
+ - "medium": Some ambiguity, but one option is clearly stronger.
+ - "low": Genuinely torn between two or more options.
+Be honest — overconfident ratings on hard cases are worse than admitting uncertainty.
+
+Respond with valid JSON matching the required schema. The "reasoning" field should cite specific words or facts from the paragraph that justify your labels (1-2 sentences).
+```
+
+---
+
+## User Prompt Template
+
+```
+Company: {company_name} ({ticker})
+Filing type: {filing_type}
+Filing date: {filing_date}
+Section: {sec_item}
+
+Paragraph:
+{paragraph_text}
+```
+
+---
+
+## Stage 2 Judge Prompt
+
+Used when Stage 1 annotators disagree. The judge sees the paragraph plus all three prior annotations in randomized order.
+
+```
+You are adjudicating a labeling disagreement among three independent annotators. Each applied the same codebook but reached different conclusions.
+
+Review all three opinions below, then provide YOUR OWN independent label based on the codebook definitions above. Do not default to majority vote — use your own expert judgment. If you agree with one annotator's reasoning, explain why their interpretation is correct.
+
+Company: {company_name} ({ticker})
+Filing type: {filing_type}
+Filing date: {filing_date}
+Section: {sec_item}
+
+Paragraph:
+{paragraph_text}
+
+--- Prior annotations (randomized order) ---
+
+Annotator A: content_category="{cat_a}", specificity_level={spec_a}
+ Reasoning: "{reason_a}"
+
+Annotator B: content_category="{cat_b}", specificity_level={spec_b}
+ Reasoning: "{reason_b}"
+
+Annotator C: content_category="{cat_c}", specificity_level={spec_c}
+ Reasoning: "{reason_c}"
+```
+
+---
+
+## Cost and Time Tracking
+
+### Per-Annotation Record
+
+Every API call produces an `Annotation` record with full provenance:
+
+```typescript
+provenance: {
+ modelId: string, // OpenRouter model ID e.g. "google/gemini-3.1-flash-lite-preview"
+ provider: string, // Upstream provider e.g. "google", "xai", "anthropic"
+ generationId: string, // OpenRouter generation ID (from response id field)
+ stage: "stage1" | "stage2-judge" | "benchmark",
+ runId: string, // UUID per batch run
+ promptVersion: string, // "v1.0" — tracks prompt iterations
+ inputTokens: number, // From usage.prompt_tokens
+ outputTokens: number, // From usage.completion_tokens
+ reasoningTokens: number, // From usage.completion_tokens_details.reasoning_tokens
+ costUsd: number, // REAL cost from OpenRouter usage.cost (not estimated)
+ latencyMs: number, // Wall clock per request
+ requestedAt: string, // ISO datetime
+}
+```
+
+### Cost Source
+
+OpenRouter returns **actual cost** in every response body under `usage.cost` (USD). No estimation needed. Each response also includes a `generationId` (the `id` field) which we store in every annotation record. This enables:
+- Audit trail: look up any annotation on OpenRouter's dashboard
+- Richer stats via `GET /api/v1/generation?id={generationId}` (latency breakdown, provider routing, native token counts)
+
+### Aggregation Levels
+
+| Level | What | Where |
+|-------|------|-------|
+| Per-annotation | Single API call cost + latency | In each Annotation JSONL record |
+| Per-model | Sum across all annotations for that model | `bun sec label:cost` |
+| Per-stage | Stage 1 total, Stage 2 total | `bun sec label:cost` |
+| Per-phase | Labeling total, benchmarking total | `bun sec label:cost` |
+| Project total | Everything | `bun sec label:cost` |
+
+### Time Tracking
+
+| Metric | How |
+|--------|-----|
+| Per-annotation latency | `Date.now()` before/after API call |
+| Batch throughput | paragraphs/minute computed from batch start/end |
+| Stage 1 wall clock | Logged at batch start and end |
+| Stage 2 wall clock | Logged at batch start and end |
+| Total labeling time | Sum of all batch durations |
+| Per-model benchmark time | Tracked during benchmark runs |
+
+All timing is logged to `data/metadata/cost-log.jsonl` with entries like:
+
+```json
+{
+ "event": "batch_complete",
+ "stage": "stage1",
+ "modelId": "openai/gpt-oss-120b",
+ "paragraphsProcessed": 50000,
+ "wallClockSeconds": 14400,
+ "totalCostUsd": 38.50,
+ "throughputPerMinute": 208.3,
+ "timestamp": "2026-03-29T10:30:00Z"
+}
+```
+
+---
+
+## NIST CSF 2.0 Mapping
+
+For academic grounding:
+
+| Our Category | NIST CSF 2.0 |
+|-------------|-------------|
+| Board Governance | GOVERN (GV.OV, GV.RR) |
+| Management Role | GOVERN (GV.RR, GV.RM) |
+| Risk Management Process | IDENTIFY (ID.RA), GOVERN (GV.RM), PROTECT (all) |
+| Third-Party Risk | GOVERN (GV.SC) |
+| Incident Disclosure | DETECT, RESPOND, RECOVER |
+| Strategy Integration | GOVERN (GV.OC, GV.RM) |
+
+---
+
+## Prompt Versioning
+
+Track prompt changes so we can attribute label quality to specific prompt versions:
+
+| Version | Date | N | Change |
+|---------|------|---|--------|
+| v1.0 | 2026-03-27 | 40 | Initial codebook-aligned prompt |
+| v1.1 | 2026-03-28 | 40 | Added calibration examples, category decision rules. Cat 95%, Spec 68%, Both 62%. |
+| v1.2 | 2026-03-28 | 40 | Expanded "what counts as unique" + materiality rule. REGRESSED (88% cat). |
+| v2.0 | 2026-03-28 | 40 | Chain-of-thought schema with specific_facts array + algorithmic specificity. Gemini/Grok 5/5, GPT-OSS broken. |
+| v2.1 | 2026-03-28 | 40 | Two-tier facts (organizational vs verifiable) + text enum labels. Gemini/Grok perfect but nano overrates. |
+| v2.2 | 2026-03-28 | 40 | Decision-test format, simplified facts, "NOT a fact" list. Cat 95%, Spec 68%, Both 65%, Consensus 100%. |
+| v2.2 | 2026-03-28 | 500 | 500-sample baseline. Cat 85.0%, Spec 60.8%, Both 51.4%, Consensus 99.6%, Spread 0.240. |
+| v2.3 | 2026-03-28 | 500 | Tightened Sector-Adapted, expanded IS/NOT lists, QV boundary rules. Spec 72.0%, Both 59.2%. [1,1,2] eliminated. |
+| v2.4 | 2026-03-28 | 500 | Validation step, schema constraint on specific_facts. Spec 78.6%, Both 66.8%. Nano overrating fixed. |
+| v2.5 | 2026-03-28 | 500 | Improved Inc↔Strat tiebreaker, QV calibration examples. **PRODUCTION**: Cat 86.8%, Spec 81.0%, Both 70.8%, Consensus 99.4%, Spread 0.130. Inc↔Strat eliminated. |
+| v2.6 | 2026-03-28 | 500 | Changed category defs to TEST: format. REGRESSED (Both 67.8%). |
+| v2.7 | 2026-03-28 | 500 | Added COMMON MISTAKES section. 100% consensus but Both 67.6%. |
+| v3.0 | 2026-03-29 | — | **Codebook overhaul.** Three rulings: (A) materiality disclaimers → Strategy Integration, (B) SPACs/no-ops → None/Other, (C) person-vs-function test for Mgmt Role vs RMP. Added full IS/NOT lists and QV-eligible list to codebook. Added Rule 2b, Rule 6, 4 new borderline cases. Prompt update pending. |
+
+When the prompt changes (after pilot testing, rubric revision, etc.), bump the version and log what changed. Every annotation record carries `promptVersion` so we can filter/compare.
+
+---
+
+## Iterative Prompt Tuning Protocol
+
+The v1.0 system prompt is built from theory and synthetic examples. Before firing the full 50K run, we iterate on real data to find and fix failure modes while it costs cents, not dollars.
+
+### Phase 0: Seed sample (before extraction is ready)
+
+Grab 20-30 real Item 1C paragraphs manually from EDGAR full-text search (`efts.sec.gov/LATEST/search-index?q="Item 1C" cybersecurity`). Paste into a JSONL by hand. This lets prompt tuning start immediately while extraction code is still being built.
+
+### Phase 1: Micro-pilot (30 paragraphs, all 3 Stage 1 models)
+
+1. Select ~30 real paragraphs covering:
+ - At least 2 per content category (incl. None/Other)
+ - At least 2 per specificity level
+ - Mix of industries and filing years
+ - 5+ deliberately tricky borderline cases
+
+2. Run all 3 Stage 1 models on these 30 with prompt v1.0.
+
+3. **You and at least one teammate independently label the same 30** using the codebook. These are your reference labels.
+
+4. Compare:
+ - Per-model accuracy vs reference
+ - Inter-model agreement (where do they diverge?)
+ - Per-category confusion (which categories do models mix up?)
+ - Per-specificity bias (do models systematically over/under-rate?)
+ - Are confidence ratings calibrated? (Do "high" labels match correct ones?)
+
+5. **Identify failure patterns.** Common ones:
+ - Models gravitating to "Risk Management Process" (largest category — pull)
+ - Models rating specificity too high (any named entity → firm-specific)
+ - Board Governance / Management Role confusion
+ - Missing None/Other (labeling boilerplate as Strategy Integration)
+
+### Phase 2: Prompt revision (v1.1)
+
+Based on Phase 1 failures, revise the system prompt:
+- Add "common mistakes" section with explicit corrections
+- Add few-shot examples for confused categories
+- Sharpen boundary rules where models diverge
+- Add negative examples ("This is NOT Incident Disclosure because...")
+
+**Do not change the Zod schema or category definitions** — only the system prompt text. Bump to v1.1. Re-run the same 30 paragraphs. Compare to v1.0.
+
+### Phase 3: Scale pilot (200 paragraphs)
+
+1. Extract 200 real paragraphs (stratified, broader set of filings).
+
+2. Run all 3 Stage 1 models with the best prompt version.
+
+3. Compute:
+ - **Inter-model Fleiss' Kappa** on category: target ≥ 0.65
+ - **Inter-model Spearman correlation** on specificity: target ≥ 0.70
+ - **Consensus rate**: % with 2/3+ agreement on both dims. Target ≥ 75%.
+ - **Confidence calibration**: are "high confidence" labels more likely agreed-upon?
+
+4. If targets not met:
+ - Analyze disagreements — genuine ambiguity or prompt failure?
+ - Prompt failure → revise to v1.2, re-run
+ - Genuine ambiguity → consider rubric adjustment (merge categories, collapse specificity)
+ - Repeat until targets met or documented why they can't be
+
+5. **Cost check**: extrapolate cost from the 200-paragraph run to the full 50K run, and confirm that reasoning-token usage stays within budget.
+
+### Phase 4: Green light
+
+Once scale pilot passes:
+- Lock prompt version (no changes during full run)
+- Lock model configuration (reasoning effort, temperature)
+- Document final prompt, configs, and pilot results
+- Fire the full 50K annotation run
+
+---
+
+## Pipeline Reliability & Observability
+
+### Resumability
+
+All API-calling scripts (annotation, judging, benchmarking) use the same pattern:
+
+1. Load output JSONL → parse each line → collect completed paragraph IDs into a Set
+2. Lines that fail `JSON.parse` are skipped (truncated from a crash)
+3. Filter input to only paragraphs NOT in the completed set
+4. For each completion, append one valid JSON line + `flush()`
+
+Appending a single line followed by `flush()` is effectively atomic on Linux for typical line sizes. Worst case on crash: one truncated line, which is skipped on reload and re-requested on the next run — at most one duplicated API call, and no completed work is lost or repeated.
+
+### Error Handling
+
+| Error Type | Examples | Strategy |
+|------------|----------|----------|
+| Transient | 429, 500, 502, 503, ECONNRESET, timeout | Exponential backoff: 1s→2s→4s→8s→16s. Max 5 retries. |
+| Permanent | 400, 422 (bad request) | Log to `{output}-errors.jsonl`, skip |
+| Validation | Zod parse fail on LLM response | Retry once, then log + skip |
+| Budget | 402 (out of credits) | Stop immediately, write session summary, exit |
+| Consecutive | 10+ errors in a row | Stop — likely systemic (model down, prompt broken) |
+
+Error paragraphs get their own file. Retry later with `--retry-errors`.
+
+### Graceful Shutdown (SIGINT/SIGTERM)
+
+On Ctrl+C:
+1. Stop dispatching new work
+2. Wait for in-flight requests to complete (already paid for)
+3. Write session summary
+4. Print final stats, exit 0
+
+### Live Dashboard (stderr)
+
+Updates every second:
+
+```
+ SEC-cyBERT │ label:annotate │ google/gemini-3.1-flash-lite-preview │ v1.1
+ ─────────────────────────────────────────────────────────────────────────
+ Progress 12,847 / 50,234 (25.6%) ETA 42m 18s
+ Session $1.23 │ 38m 12s elapsed │ 337.4 para/min
+ Totals $4.56 all-time │ 3 errors (0.02%) │ 7 retries
+ Latency p50: 289ms │ p95: 812ms │ p99: 1,430ms
+ Reasoning avg 47 tokens/para │ 12.3% of output tokens
+```
+
+Goes to stderr so stdout stays clean.
+
+### Session Log
+
+Every run appends to `data/metadata/sessions.jsonl`:
+
+```json
+{
+ "sessionId": "a1b2c3d4",
+ "command": "label:annotate",
+ "modelId": "google/gemini-3.1-flash-lite-preview",
+ "stage": "stage1",
+ "promptVersion": "v1.1",
+ "startedAt": "2026-03-29T10:00:00Z",
+ "endedAt": "2026-03-29T10:38:12Z",
+ "durationSeconds": 2292,
+ "paragraphsTotal": 50234,
+ "paragraphsProcessed": 12847,
+ "paragraphsSkippedResume": 37384,
+ "paragraphsErrored": 3,
+ "costUsd": 1.23,
+ "reasoningTokensTotal": 482000,
+ "avgLatencyMs": 450,
+ "p95LatencyMs": 812,
+ "throughputPerMinute": 337.4,
+ "concurrency": 12,
+ "exitReason": "complete"
+}
+```
+
+`exitReason`: `complete` | `interrupted` (Ctrl+C) | `budget_exhausted` (402) | `error_threshold` (consecutive limit)
+
+### OpenRouter Generation ID
+
+Every annotation record includes the OpenRouter `generationId` from the response `id` field. This enables:
+- **Audit trail**: look up any annotation on OpenRouter's dashboard
+- **Rich stats**: `GET /api/v1/generation?id={generationId}` returns latency breakdown, provider routing, native token counts
+- **Dispute resolution**: if a label looks wrong, inspect the exact generation that produced it
+
+---
+
+## Gold Set Protocol
+
+### Sampling (1,200 paragraphs minimum)
+
+Stratify by:
+- Content category (all 7 represented, oversample rare categories)
+- Specificity level (all 4 represented)
+- GICS sector (financial services, tech, healthcare, manufacturing minimum)
+- Filing year (FY2023 and FY2024)
+
+### Human Labeling Process
+
+Labeling is done through a purpose-built web tool that enforces quality:
+1. **Rules quiz:** Every annotator must read the codebook and pass a quiz on the rules before each labeling session. The quiz tests the three most common confusion axes: Management Role vs RMP (person-vs-function test), materiality disclaimers (Strategy Integration vs None/Other), and QV fact counting.
+2. **Warm-up:** The first 5 paragraphs of each session are warm-up items (pre-labeled, with feedback); they are not counted toward the gold set.
+3. **Independent labeling:** Three team members independently label the full gold set using this codebook.
+4. Compute inter-rater reliability:
+ - Cohen's Kappa (for content category — nominal, pairwise)
+ - Krippendorff's Alpha (for specificity level — ordinal, all annotators)
+ - Per-class confusion matrices
+ - **Target: Kappa > 0.75, Alpha > 0.67**
+5. Adjudicate disagreements: third annotator tiebreaker, or discussion consensus with documented rationale
+6. Run the full GenAI pipeline on the gold set and compare to human labels
+
+### If Agreement Is Poor
+
+- If Kappa < 0.60 on any category pair: revise that category's definition and boundary rules, re-pilot
+- If Alpha < 0.50 on specificity: collapse 4-point to 3-point scale (merge 1+2 into "Non-specific" or 3+4 into "Substantive")
+- Document the collapse decision and rationale in this codebook
diff --git a/docs/NARRATIVE.md b/docs/NARRATIVE.md
new file mode 100644
index 0000000..87b1caa
--- /dev/null
+++ b/docs/NARRATIVE.md
@@ -0,0 +1,449 @@
+# Project Narrative — SEC Cybersecurity Disclosure Quality Classifier
+
+This document captures the process, roadblocks, decisions, and resolutions from building the SEC cybersecurity disclosure quality classifier. It serves as the source material for the final paper and presentation.
+
+---
+
+## Phase 1: Project Scoping and Construct Design
+
+### The Problem
+
+SEC Release 33-11216 (July 2023) created a new annual cybersecurity disclosure requirement (10-K Item 1C) and an incident disclosure requirement (8-K Item 1.05). By FY2024, ~9,000-10,000 filings exist. No validated classifier or public labeled dataset exists for assessing the quality of these disclosures. Investors, regulators, and compliance officers need scalable tools to distinguish substantive disclosures from boilerplate.
+
+### Methodology Decision: Ringel (2023) "Synthetic Experts"
+
+We adopted the Ringel (2023) "Synthetic Experts" pipeline: use frontier LLMs to generate training labels at scale, then distill into an efficient encoder model. This approach was chosen because:
+- Manual labeling of 50,000+ paragraphs is infeasible for a 6-person team
+- Multiple cheap LLMs annotating in parallel provide built-in quality control through inter-annotator agreement
+- The encoder distillation step produces a model that can classify at inference time without LLM API costs
+
+### Construct: Two Classification Dimensions
+
+We defined two simultaneous classification tasks per paragraph:
+1. **Content Category** (7 mutually exclusive classes) — what the paragraph is about, grounded in the SEC rule's own structure (Board Governance, Management Role, Risk Management Process, Third-Party Risk, Incident Disclosure, Strategy Integration, None/Other)
+2. **Specificity Level** (4-point ordinal) — how company-specific the disclosure is, from generic boilerplate to quantified-verifiable facts
+
+The construct maps to NIST CSF 2.0 categories for academic grounding.
+
+---
+
+## Phase 2: Data Acquisition and Corpus Construction
+
+### The Extraction Problem
+
+SEC filings are not structured data. They're HTML generated from PDFs, XBRL, and Word documents by dozens of different tools, each producing different artifacts. Building a reliable extraction pipeline for ~9,000 filings meant solving a series of messy, real-world data engineering problems.
+
+### Pipeline Architecture
+
+Built in TypeScript (~1,000 lines of extraction code across `parse-item1c.ts`, `segment.ts`, `fast-reparse.ts`, and pipeline orchestration):
+
+```
+EDGAR Master Index → enumerate 10-K filings → download HTML → extract Item 1C → segment paragraphs → JSONL
+submissions.zip → scan for 8-K Item 1.05 → download HTML → extract → segment → merge with 10-K corpus
+```
+
+### Roadblock: HTML Variability
+
+Every filing's HTML is different. The same logical content looks completely different depending on the tool that generated the HTML:
+
+- **Word splitting from inline elements.** XBRL and styling tags break words mid-token: "Item 2" split across adjacent inline elements renders correctly in a browser but parses as "Item2" in code. The same happens to words like "cybersecurity" when split mid-word across tags. Required detecting adjacent inline element boundaries and inserting spaces selectively.
+
+- **CamelCase joins from PDF converters.** PDF-to-HTML tools merge sentences across formatting boundaries: `sentence.Next sentence` instead of `sentence. Next sentence`. Required regex passes to detect missing spaces after punctuation.
+
+- **Page breaks mid-sentence.** Page numbers (`28`, `- 12 -`, `F-3`), running headers (`ACME CORP — ANNUAL REPORT`), and subsidiary headers (`ENTERGY ARKANSAS, LLC AND SUBSIDIARIES`) get spliced into the middle of content paragraphs. Required filtering a catalog of page artifact patterns.
+
+- **Table of Contents shadowing.** "Item 1C" appears at least twice in every 10-K — once in the Table of Contents and once in the actual content. Using the first match extracts the wrong section. Took several iterations to discover we needed the LAST match — this was a silent failure that produced empty or wrong extractions for hundreds of filings before we caught it.
+
+- **XBRL tag pollution.** Inline XBRL wraps financial facts in `ix:header`, `ix:references`, and `ix:nonFraction` tags that carry no display content but add noise. Required stripping all `ix:*` tags before text processing.
+
+- **Entity encoding chaos.** `&nbsp;`, `&#160;`, `&ldquo;`, `&rdquo;`, `&mdash;`, `&ndash;`, `&bull;` — each needs correct decoding, and different filing tools use different entity styles for the same characters.
+
+### Paragraph Segmentation
+
+After extracting clean section text, splitting into paragraphs had its own challenges:
+
+- **Bullet list merging.** Disclosures frequently use bullet lists ("Our program includes: • risk assessment • vulnerability scanning"). Bullets need to be merged with their intro sentence; a standalone "• vulnerability scanning" is meaningless.
+- **Continuation line detection.** Sentences split across HTML block elements need rejoining. Heuristic: if the previous block lacks terminal punctuation and the next starts lowercase or with a continuation phrase (`and`, `or`, `including`, `such as`), merge.
+- **Length boundaries.** Under 20 words → likely a header (filtered). Over 500 words → split at sentence boundaries to keep annotation units manageable.
+
+### 8-K Extraction
+
+**Roadblock: EDGAR full-text search misses filings.** The EFTS keyword search doesn't reliably return all cybersecurity 8-Ks. Post-May 2024, companies moved non-material disclosures from Item 1.05 to Items 8.01 or 7.01.
+
+**Resolution:** Built `scan-8k-items.py` to scan the SEC's bulk `submissions.zip` deterministically — a gap-free scan of every 8-K with cybersecurity content. Tries items in priority order (1.05 → 8.01 → 7.01), skips cross-reference stubs. Result: **207 cybersecurity incident 8-K filings** identified — a complete inventory.
+
+### Paragraph Deduplication
+
+Each paragraph gets a `textHash` (SHA-256 of normalized text). Deduplication at three levels:
+
+1. **Within-filing:** Parser artifacts sometimes produce duplicate blocks. Removed by textHash.
+2. **Cross-year (same company):** Companies copy-paste identical paragraphs year-to-year. Detected but kept — the repetition itself is informative for disclosure quality analysis.
+3. **Cross-company boilerplate:** Different companies use identical materiality disclaimers. Detected but kept — these are real Specificity 1 examples.
+
+**Result:** Only ~27 excess duplicates removed (0.04%). Most textual similarity is legitimate variation.
+
+### Performance at Scale
+
+Initial extraction with cheerio (DOM parser) was slow for 9,000 filings. Built `fast-reparse.ts` (regex-only HTML stripping, no DOM) and `parallel-reparse.ts` (16 Bun workers in parallel). Also deduplicates amendment filings (keeps latest per CIK×FiscalYear).
+
+### Corpus Statistics
+
+- **72,045 paragraphs** from ~9,000 filings (FY2023 + FY2024 + early FY2025)
+- All 10-K Item 1C; 207 8-K paragraphs extracted separately
+- Median ~7 paragraphs per filing
+- 49,795 paragraphs annotated (after filtering to complete filing metadata)
+
+### Roadblock: Truncated Filings
+
+Discovered 72 filings (~0.8%) where section boundary detection cut off mid-sentence. A paragraph about CISSP certifications cut mid-sentence looks like vague boilerplate — this would corrupt specificity labels.
+
+**Resolution:** Exclude from training splits. Filings where the last paragraph doesn't match `/[.!?;")\u201d]\s*$/` are filtered before train/val/test creation.
+
+---
+
+## Phase 3: Codebook Development
+
+### Initial Codebook (v1.0)
+
+Built a detailed labeling codebook (`docs/LABELING-CODEBOOK.md`) grounded in the SEC rule structure. Includes:
+- 7 category definitions with SEC basis citations, key markers, and example texts
+- 4 specificity levels with boundary rules
+- 5 category decision rules for common ambiguities
+- 5 borderline cases with worked reasoning
+- Gold set protocol for human validation
+
+### Codebook Iteration (v3.0 — 2026-03-29)
+
+After analyzing 150,000+ Stage 1 annotations and identifying systematic disagreement patterns, we made three major codebook rulings:
+
+**Ruling A — Materiality Disclaimers:** Paragraphs with explicit materiality assessments ("have not materially affected our business strategy, results of operations, or financial condition") are Strategy Integration, even if boilerplate. A cross-reference to Risk Factors appended to a materiality assessment does not change the classification. Only pure cross-references with no materiality conclusion are None/Other. *This resolved ~1,094 disputed paragraphs.*
+
+**Ruling B — SPACs and Shell Companies:** Companies explicitly stating they have no operations, no cybersecurity program, or no formal processes receive None/Other regardless of incidental mentions of board oversight or risk acknowledgment. The absence of a program is not a description of a program. *This resolved ~53 unresolved paragraphs and likely hundreds more.*
+
+**Ruling C — Person vs. Function Test (Management Role vs. RMP):** This was the single most impactful ruling, addressing the #1 disagreement axis (2,290 disputes). The line: if the paragraph is about the *person* (qualifications, credentials, background, tenure, career history) → Management Role. If it's about what the role/program *does* (processes, activities, tools, frameworks) → Risk Management Process, even if a CISO/CIO/CTO title appears. The test: would the paragraph still make sense if you removed the person's name, title, and credentials? If yes → the paragraph is about the function, not the person.
+
+---
+
+## Phase 4: Stage 1 — Synthetic Expert Annotation
+
+### Tech Stack Decision
+
+Chose TypeScript + Vercel AI SDK v6 + OpenRouter over Python + LangChain/LiteLLM because:
+- Vercel AI SDK provides native structured output with Zod schema validation
+- OpenRouter gives single-API access to all candidate models with real cost tracking
+- Bun runtime for fast script execution with native TypeScript support
+- JSONL-append pattern for crash-safe resume without data loss or duplicate API spend
+
+### Prompt Engineering (12+ iterations, v1.0 → v2.7; locked at v2.5)
+
+This was one of the most time-intensive phases. Key lessons:
+
+**What worked:**
+- Text enum labels ("Firm-Specific") over ordinals ("3") — universal improvement across all models
+- Decision-test format ("ask in order, stop at first yes") for specificity — reduced ambiguity
+- ✓ IS / ✗ NOT fact lists with explicit examples — the single biggest lever for specificity accuracy. Reduced overrating from 54 to 21 cases.
+- Validation step ("review your specific_facts, remove NOT-list items") — caught model self-correction
+- 13 calibration examples, each targeting a specific observed failure mode — examples outperformed rules
+- Explicit Incident↔Strategy tiebreaker — completely eliminated a 20-case confusion pattern
+- `specific_facts` chain-of-thought in the schema — forces the model to enumerate evidence before assigning specificity
+
+**What didn't work:**
+- Adding more rules (v1.2) — confused models, caused regression from 95%→88% category accuracy
+- Changing category definitions to structural "TEST:" format (v2.6) — regression
+- "COMMON MISTAKES" section (v2.7) — improved consensus but reduced unanimity
+- Attempting a Management↔RMP tiebreaker in the prompt (v2.5) — made confusion worse (this was ultimately resolved through the v3.0 codebook ruling instead)
+
+**Critical lesson: 40-sample pilots were misleadingly optimistic.** Results that looked good at n=40 fell apart at n=500. We standardized on 500-sample pilots for all prompt evaluation.
+
+### The Iteration Trajectory
+
+Five 40-sample pilots (v1.0, v1.1, v1.2, v2.1, v2.2-n40) followed by six 500-sample pilots (v2.2-v2.7):
+
+| Version | n | Both Unan | Key Change | Top Confusion Axis |
+|---------|---|-----------|-----------|-------------------|
+| v2.2 | 500 | 51.4% | First 500-sample baseline | Incident↔Strategy (20 cases) |
+| v2.3 | 500 | 59.2% | Tightened Sector-Adapted, expanded IS/NOT lists | Inc↔Strat reduced |
+| v2.4 | 500 | 66.8% | Validation step, schema constraint on specific_facts | Mgmt↔RMP emerging |
+| **v2.5** | **500** | **70.8%** | Incident↔Strategy tiebreaker, QV calibration examples | **Inc↔Strat eliminated**; Mgmt↔RMP now #1 (17 cases) |
+| v2.6 | 500 | 67.8% | Changed defs to "TEST:" format — **regression** | — |
+| v2.7 | 500 | 67.6% | Added COMMON MISTAKES section — **regression** | — |
+
+The most dramatic single improvement: v2.5's Incident↔Strategy tiebreaker ("DESCRIBES what happened → Incident; ONLY discusses cost/materiality → Strategy") completely eliminated what had been the #1 confusion axis at v2.2 (20 cases → 0). This is a case where a single well-targeted rule outperformed broad prompt restructuring.
+
+v2.5 was locked as the production prompt. v2.6 and v2.7 demonstrated that the prompt had reached its practical ceiling — further structural changes caused regressions. The remaining disagreements (Management↔RMP, specificity boundaries) turned out to be codebook ambiguities and model-capacity issues, not prompt failures.
+
+### The Original Panel and the Nano Problem
+
+The initial Stage 1 panel was:
+- `google/gemini-3.1-flash-lite-preview`
+- `openai/gpt-5.4-nano`
+- `x-ai/grok-4.1-fast`
+
+GPT-5.4-nano was chosen for its low cost and the assumption that even a small model could handle structured classification with a good enough prompt. This assumption was wrong.
+
+**The problem: nano wasn't thinking.** During pilot testing, we discovered nano produced **zero reasoning tokens 64% of the time**. When it did reason, the output was minimal (34,356 total reasoning tokens across 500 paragraphs, vs grok's 336,993). Without reasoning, nano's classifications were essentially pattern-matching on surface features — it couldn't apply the multi-step decision logic the codebook requires (enumerate facts, filter against IS/NOT lists, count QV-eligible items, apply threshold).
+
+**The symptoms:**
+- **Erratic specificity** — nano was simultaneously too conservative on some axes ([1,3,3] disagreements — 21 cases where nano said Generic when gemini+grok said Firm-Specific) and too liberal on others ([3,3,4] — 11 cases where nano said Quantified when the others said Firm-Specific). No prompt change fixed this because it's a model-level capacity issue: without reasoning tokens, the decision test can't execute properly.
+- **Lowest pairwise agreement** — gemini×grok agreed on 95.6% of categories and 91.2% of specificity. gemini×nano: 87.4% category, 83.8% specificity. Nano was the consistent outlier.
+- **Dragging down unanimity** — the gemini+grok pair was strong, but nano's disagreements broke unanimity on hundreds of paragraphs that would otherwise have been clean.
+
+Despite 12 prompt iterations (v1.0→v2.7) that improved overall metrics significantly, nano's behavior never stabilized. The prompt was at its practical ceiling for a model that wouldn't reason.
+
+### Smoke Testing: model-probe.ts
+
+Before running an expensive benchmark, we built `model-probe.ts` to test 9 candidate models on a single paragraph for basic structured output compliance:
+- gemini-3.1-flash-lite-preview, grok-4.1-fast, gpt-4.1-mini, gpt-4.1-nano, claude-haiku-4.5, gemini-3.1-flash-preview, deepseek-chat-v3-0324:free, llama-4-maverick, qwen3-235b-a22b
+
+This caught schema-level incompatibilities (wrong field names, missing fields, invalid enum values) before we spent money on 500-paragraph bench runs.
+
+### Model Benchmark: 6 Candidates to Replace Nano
+
+After locking prompt v2.5, we built `model-bench.ts` to formally evaluate nano replacements. Each candidate was benchmarked against the 500-sample pilot set and compared to the existing gemini+grok annotations.
+
+| Model | Cost/ann | Reasoning Tokens | vs Majority (both) | Cat Outlier | Spec Outlier | Nano→X Delta |
+|-------|----------|-----------------|---------------------|-------------|-------------|-------------|
+| seed-2.0-lite | $0.00227 | 658 | **88.8%** | 2.2% | 3.8% | +11.6pp |
+| **mimo-v2-flash** | **$0.00048** | **1,346** | **86.0%** | **5.0%** | **4.0%** | **+8.8pp** |
+| glm-4.5-air | $0.00136 | 854 | 76.2% | 8.8% | 9.6% | +0.8pp |
+| minimax-m2.5 | $0.00106 | 590 | 73.8% | 7.9% | 12.7% | -1.0pp |
+| mistral-small-2603 | $0.00015 | **0** | 66.8% | 9.2% | 17.6% | -6.8pp |
+| nemotron-3-super-120b | $0.00152 | 942 | 57.9% | **21.3%** | **20.7%** | **-16.9pp** |
+
+**Key findings:**
+
+- **Reasoning tokens are the strongest predictor of accuracy.** Mistral-small produced literally zero reasoning tokens — not a single one. Its average output was only 136 tokens (vs mimo's 1,463). It had a 17.6% specificity outlier rate. This confirmed that the nano problem wasn't prompt-specific: models that don't reason can't do this task.
+
+- **Price ≠ quality.** Nemotron was the second most expensive candidate at $0.00152/annotation with 942 reasoning tokens (it *was* thinking), but thinking badly — 21.3% category outlier rate, worst of any candidate. Only 497/500 completed (3 failures). Replacing nano with nemotron would have been catastrophic: -16.9pp unanimity.
+
+- **The two mediocre options.** GLM-4.5-air (+0.8pp) and minimax-m2.5 (-1.0pp) neither helped nor hurt. Not worth the switch.
+
+- **Seed-2.0-lite was technically the best** at 88.8% agreement with majority, but cost 4.7x more than mimo ($0.00227 vs $0.00048) and was 2x slower (21.5s vs 11.4s latency). For 50K+ paragraphs at scale, this cost differential was significant.
+
+### The Winner: mimo-v2-flash
+
+Mimo won the slot on value:
+1. **Cheapest viable option** — $0.00048/annotation (3x cheaper than most candidates)
+2. **Most reasoning tokens** — 1,346 avg (highest of all 6, more than seed-2.0-lite)
+3. **Lowest outlier rate** — 5.0% category, 4.0% specificity
+4. **+8.8pp unanimity improvement** over nano
+5. **93.4% category agreement with grok** — strongest pairwise alignment of any candidate
+
+**Roadblock: Mimo schema quirks.** Mimo produced non-standard outputs: capitalized confidence labels ("High" instead of "high"), numeric confidence values (0.9 instead of "high"), and flat string arrays instead of structured `{fact, type}` objects for specific_facts. Rather than trying to fix this with prompting (which would waste tokens and might break other behavior), we fixed it with Zod schema transforms — `.transform()` to normalize casing and map numbers to labels, `.union()` to accept both structured and flat fact formats. This took ~30 minutes to implement and handled all edge cases automatically.
+
+A dedicated `mimo-pilot.ts` script modeled the full "replace nano with mimo" scenario before committing to the panel change.
+
+**Final Stage 1 panel:**
+- `google/gemini-3.1-flash-lite-preview`
+- `xiaomi/mimo-v2-flash` ← replaced `openai/gpt-5.4-nano`
+- `x-ai/grok-4.1-fast`
+
+### Production Run Results
+
+Completed 2026-03-28. **150,009 annotations** (50,003 paragraphs × 3 models), **$115.88 total cost**, **0 failures**.
+
+| Metric | Value |
+|--------|-------|
+| Both-unanimous | 35,204 (70.7%) |
+| Majority agreement | 14,182 (28.5%) |
+| Unresolved (3-way split) | 409 (0.8%) |
+| Total cost | $115.88 |
+| Failures | 0 |
+
+---
+
+## Phase 5: Post-Stage 1 Analysis — Discovering Systematic Patterns
+
+After the production run, we conducted a deep distributional analysis of disagreement patterns. This analysis fundamentally changed our approach to Stage 2.
+
+### Model Bias Discovery
+
+Each model has systematic, quantifiable biases:
+
+| Model | Category Outlier Rate | Specificity Outlier Rate | Key Bias |
+|-------|----------------------|--------------------------|----------|
+| Mimo | **48.1%** | 32.5% | Over-classifies as Third-Party Risk; under-rates Spec 4 (74.3% of Spec 4 outlier cases) |
+| Gemini | 30.9% | **45.7%** | Over-classifies as Management Role (81.1% in Mgmt↔RMP disputes); inflates specificity |
+| Grok | 21.0% | 21.8% | Most moderate; slight RMP bias |
+
+These biases are not random — they're predictable by model and confusion axis. This opened the possibility of model-calibrated majority voting (using the known biases to assess when the majority is likely correct).
+
+### Key Distributional Findings
+
+1. **Management Role is the disaster category** — only 51.5% unanimous (every other category is 62-79%). Nearly half of all Management Role paragraphs need resolution.
+2. **Spec 4 (Quantified-Verifiable) is the disaster specificity** — only 37.6% unanimous. Models can't agree on what counts as "quantified."
+3. **Stage 1 confidence is completely useless** — 95.4% of paragraphs report all-high category confidence. Zero all-low cases. The cheap models are systematically overconfident.
+4. **Specificity is effectively a 3-level scale** — Spec 2 (Sector-Adapted) is rarely disputed (82.1% unanimous). The contested boundaries are [1,3] (3,742 disputes) and [3,4] (2,898 disputes) with almost nothing at [1,2] or [2,3].
+5. **Longer paragraphs are harder** — Q5 word count (>134 words): 64.1% unanimous vs Q1 (≤51 words): 76.3%.
+6. **Small companies (1-3 paragraphs) are noise-prone** — 50.0% unanimous, 10.5% unresolved. Almost all are SPACs or shell companies with non-standard disclosures.
+
+### Top Disagreement Axes
+
+| Axis | Disputes | Pattern |
+|------|----------|---------|
+| Management Role ↔ RMP | 2,290 | Paragraph describes processes but names CISO/CIO |
+| RMP ↔ Third-Party Risk | 1,475 | Mimo over-classifies vendor mentions as Third-Party |
+| None/Other ↔ Strategy Integration | 1,094 | Materiality disclaimers — genuinely ambiguous in codebook |
+| Board Governance ↔ Management Role | 867 | Paragraphs at the board-management interface |
+| Spec [1,3] boundary | 3,742 | NOT-list items counted as specific facts |
+| Spec [3,4] boundary | 2,898 | Gemini counts roles as QV-eligible; Mimo downgrades |
+
+### Insight: Reading the Actual Paragraphs
+
+We sampled 20 paragraphs across the 4 hardest dispute types and read them in full. Patterns emerged:
+
+- **Management↔RMP:** Every example follows the same structure — a process-focused paragraph that names a CISO/CIO in the opening attribution. The paragraph's content is about what the program does, not who the person is. The v3.0 "person-vs-function" ruling directly addresses this.
+- **None/Other↔Strategy:** All 5 sampled paragraphs are "no material incidents" boilerplate. Every single one. The materiality disclaimer ruling resolves this entirely.
+- **Spec [3,4]:** Gemini counts "20 years of experience" + "CISO" as 2 QV facts → Spec 4. Grok/Mimo correctly exclude named roles from QV counting → Spec 3. The rule exists in the prompt but Gemini ignores it.
+- **Small company unresolved:** All SPACs or blank check companies with "we have no operations" disclaimers. The SPAC ruling handles these.
+
+---
+
+## Phase 6: Stage 2 — Judge Model Evaluation
+
+### Gold Label Construction
+
+Built a 50-paragraph gold set using 3 independent Sonnet agents:
+- Agent A: paragraphs 0-24
+- Agent B: paragraphs 25-49
+- Agent C: all 50 as cross-check
+- Adjudicator agent resolved 11 disputes with detailed reasoning
+- Inter-annotator agreement: 94% category, 84% specificity, 78% both
+
+**Lesson learned: majority vote ≠ ground truth.** Initially scored judges against Stage 1 majority, which made gemini-3-flash look great (86% category match). Scoring against gold labels revealed it added zero value — it was rubber-stamping the majority. Always evaluate against adjudicated gold labels.
+
+### Judge Model Benchmarking (8 candidates)
+
+| Model | Mode | n | Cat | Spec | Both | Fails | Cost/call |
+|-------|------|---|-----|------|------|-------|-----------|
+| Majority vote | — | 50 | 78.0% | 80.0% | 60.0% | 0% | $0 |
+| gpt-5.4-mini | structured | 50 | 88.0% | 80.0% | 68.0% | 0% | $0.0046 |
+| GLM-5 v2 | structured | 48 | 87.5% | 89.6% | 77.1% | 4% | $0.0078 |
+| GLM-5 v4 | structured+req_params | 44 | 90.9% | 88.6% | 79.5% | 12% | $0.0083 |
+| GLM-5 v3 | tool calling | 50 | 84.0% | 82.0% | 72.0% | 0% | $0.0070 |
+
+### Roadblock: GLM-5 Structured Output Failures
+
+GLM-5 had the best accuracy (77-80% both-correct) but a 6-12% structured output failure rate. The model intermittently wraps JSON in markdown code blocks.
+
+**Investigation:** Built diagnostic scripts (`judge-diag.ts`, `judge-diag-batch.ts`) to isolate the issue. Tested all 9 failing paragraphs × 2 attempts each. Found 72% success rate, all from the same model variant (`z-ai/glm-5-20260211`). The best OpenRouter provider (Ambient) has a 6% base error rate. This is a model-level behavior, not provider-specific.
+
+**Attempted fixes:**
+- Bumped validation retries from 1 to 3 → reduced failures from 18% to ~4-12%
+- Tool calling mode → 0% failures but accuracy dropped ~7pp (72% both). Enum constraints not enforced, `undefined` categories appear.
+- `provider: { require_parameters: true }` in OpenRouter → no effect
+- Exacto routing → no effect
+
+**Resolution:** Accepted as a model-level constraint. Production strategy will use the best model with retry logic and fall back to a reliable model (gpt-5.4-mini) for persistent failures.
+
+### Judge Prompt Iteration (v1 → v2)
+
+Built a dynamic judge prompt (`buildJudgePrompt()`) with:
+- **Disagreement diagnosis:** Tells the judge exactly what's in dispute and the vote distribution
+- **Targeted disambiguation rules:** 7 category guidance blocks + 2 specificity guidance blocks, dynamically included only when relevant to the specific dispute
+- **Structured analysis steps:** Critique each annotator → enumerate IS-list facts → determine dominant purpose → decide
+- **Confidence calibration:** HIGH/MEDIUM/LOW mapped to codebook clarity, used as training weights
+- **Anti-bias:** Fisher-Yates shuffle of annotator order
+
+**Results:** Category accuracy improved +10pp over majority vote for both models. Specificity improved +9.8pp for GLM-5 but stayed flat for gpt-5.4-mini. The disambiguation rules work well for category but specificity needs the codebook v3.0 changes.
+
+### Key Finding: Judge Confidence Is Highly Predictive
+
+| Confidence | GLM-5 Both-Correct | gpt-5.4-mini Both-Correct |
+|------------|--------------------|----|
+| High | 82-84% | 80.6% |
+| Medium | 25-50% | 35.7% |
+
+This enables confidence-stratified training data: high-confidence judge labels get full weight; medium/low are downweighted or excluded.
+
+---
+
+## Phase 7: Revised Data Quality Strategy (Current)
+
+The post-Stage 1 analysis and judge benchmarking led to a fundamental reassessment of our approach.
+
+### The Key Realization
+
+The best judge (77% both-correct) barely beats the raw majority vote (78% category, 80% specificity). Judging all 14,591 disputed paragraphs at 77% accuracy doesn't meaningfully improve on the majority. The judge's real value is concentrated in two places:
+1. The 409 unresolved paragraphs where no majority exists
+2. Cases where we have specific reason to doubt the majority
+
+### The Revised Plan
+
+**Phase 0: Codebook rulings (completed)** — Three rulings that resolve thousands of disputes at zero inference cost: materiality disclaimers → Strategy Integration, SPACs → None/Other, person-vs-function test for Management↔RMP.
+
+**Phase 1: Model-calibrated majority resolution** — For the 14,182 majority-agreement paragraphs, apply calibration using known model biases. When the known-biased model is the outlier on a known axis → trust majority. Flag anomalous cases for judge resolution. Expected to auto-resolve ~10,000-12,000 paragraphs.
+
+**Phase 2: Human gold set (1,200 paragraphs)** — Assignment requires 1,200 human-labeled paragraphs. Building a quiz-gated labeling web tool that enforces codebook knowledge before each session. Stratified sampling to ensure all categories, specificity levels, and confusion axes are represented. This becomes the calibration metric for all further work.
+
+**Phase 3: Judge prompt iteration** — Update judge prompt to mirror codebook v3.0 rulings. Add worked examples from the 11 gold adjudications. Iterate against expanded gold set. Target: 85%+ both-correct.
+
+**Phase 4: Production judge run** — Judge only the ~3,000-5,000 genuinely hard cases (unresolved + flagged majority + "both" disputes). Two models for cross-validation on the hardest cases.
+
+**Phase 5: Training data assembly** — Confidence-stratified tiers:
+
+| Tier | Source | Est. Accuracy | Paragraphs | Treatment |
+|------|--------|--------------|------------|-----------|
+| T1 | Both-unanimous | ~97% | 35,204 | Full weight |
+| T2 | Calibrated majority | ~85-90% | ~9,000-12,000 | Full weight |
+| T3 | Judge high-confidence | ~84% | ~2,000-3,000 | Full weight |
+| T4 | Judge medium-confidence | ~40% | ~500-1,000 | Downweight (0.5) or soft labels |
+| T5 | Judge low / failure / excluded | ??? | ~500-1,000 | Exclude |
+
+Expected total: ~46,000-48,000 paragraphs at ~93-95% label accuracy.
+
+---
+
+## Running Cost Ledger
+
+| Phase | Cost | Notes |
+|-------|------|-------|
+| Stage 1 production run | $115.88 | 150,009 annotations, 0 failures |
+| Stage 1 prompt iteration (pilots) | ~$15 | 12+ versions × 500-sample pilots |
+| Judge benchmarking | ~$5 | 8 models × 50-sample gold set |
+| Judge prompt iteration | ~$3 | Ongoing |
+| **Total to date** | **~$139** | |
+
+---
+
+## Key Technical Artifacts
+
+| Artifact | Location | Description |
+|----------|----------|-------------|
+| Labeling codebook | `docs/LABELING-CODEBOOK.md` | Authoritative reference, v3.0 with codebook rulings |
+| Stage 1 annotations | `data/annotations/stage1.jsonl` | 150,009 annotations (120 MB) |
+| Paragraphs | `data/paragraphs/paragraphs-clean.jsonl` | 72,045 paragraphs with filing metadata |
+| Gold labels | `data/bench/judges/gold-final.json` | 50 adjudicated gold labels |
+| Gold adjudications | `data/bench/judges/gold-adjudicated.json` | 11 detailed adjudication decisions with reasoning |
+| Stage 1 prompt | `ts/src/label/prompts.ts` | SYSTEM_PROMPT (v2.5) + buildJudgePrompt() |
+| Annotation runner | `ts/scripts/stage1-run.ts` | Resume-safe, configurable concurrency |
+| Analysis scripts | `ts/scripts/stage1-analyze.ts`, `segment-analysis.ts`, `model-bias-analysis.ts`, `dispute-crosstab.ts`, `sample-disputes.ts` | Deep analytics on annotation data |
+| Judge benchmarking | `ts/scripts/judge-bench.ts` | Supports structured/tool modes, gold label comparison |
+| Judge diagnostics | `ts/scripts/judge-diag.ts`, `judge-diag-batch.ts` | GLM-5 failure investigation |
+| Model benchmarking | `ts/scripts/model-bench.ts` | Stage 1 candidate evaluation |
+
+---
+
+## Lessons Learned
+
+### On Prompt Engineering
+- Calibration examples beat rules. Each example targets a specific observed failure mode.
+- Pilots must be large enough (500+). 40-sample pilots were misleadingly optimistic.
+- More rules ≠ better. After the core structure is right, additional rules cause regression.
+- The `specific_facts` chain-of-thought schema (forcing models to enumerate evidence before deciding) was the single most impactful structural change.
+
+### On Model Selection
+- Reasoning tokens are the strongest predictor of accuracy, not price or model size.
+- Schema compliance varies — fix with Zod transforms, not prompt changes.
+- Test both structured output AND tool calling for any candidate. They are not equivalent.
+
+### On Evaluation
+- **Never evaluate against majority vote.** Build gold labels. Majority vote as ground truth makes models that rubber-stamp the majority look good.
+- **Judge confidence is highly predictive** of accuracy. Use it to weight training samples.
+- **Stage 1 confidence is useless** — cheap models are systematically overconfident (95%+ all-high).
+
+### On Data Quality at Scale
+- The biggest wins come from understanding *where* and *why* models disagree, not from blanket improvements.
+- Systematic model biases are quantifiable and predictable. Use them as signal, not noise.
+- Codebook ambiguity causes more disagreement than model limitations. Three codebook rulings resolved more disputes than any prompt change.
+- Not all labels need the same treatment. Confidence-stratified assembly beats uniform labeling.
diff --git a/docs/PROJECT-OVERVIEW.md b/docs/PROJECT-OVERVIEW.md
new file mode 100644
index 0000000..a2a6272
--- /dev/null
+++ b/docs/PROJECT-OVERVIEW.md
@@ -0,0 +1,243 @@
+# SEC Cybersecurity Disclosure Quality Classifier
+
+## Project Summary
+
+Build a validated, reusable classifier that labels SEC cybersecurity disclosures by **content category** and **specificity level**, then fine-tune an open-weights encoder model for deployment at scale.
+
+**Methodology:** Ringel (2023) "Synthetic Experts" pipeline — use frontier LLMs to generate training labels, then distill into a small open-weights encoder model.
+
+**Construct:** Project 3 from the Capstone Constructs document — "Cybersecurity Governance and Incident Disclosure Quality (SEC-Aligned)."
+
+**Three publishable artifacts:**
+1. A novel dataset of extracted Item 1C disclosures (no public HuggingFace dataset exists)
+2. A labeling methodology for cybersecurity disclosure quality
+3. A SOTA classifier (SEC-ModernBERT-large — first SEC-specific ModernBERT)
+
+---
+
+## Why This Matters
+
+Cybersecurity risk is among the most financially material operational risks facing firms. In July 2023, the SEC adopted Release 33-11216 requiring:
+- **Annual disclosure** of cybersecurity risk management, strategy, and governance (10-K Item 1C)
+- **Incident disclosure** within 4 business days of materiality determination (8-K Item 1.05)
+
+Investors, boards, and regulators need tools to assess whether disclosures are substantive or boilerplate, whether governance structures are robust or ceremonial, and whether incident reports are timely and informative. **No validated, construct-aligned classifier exists for this purpose.**
+
+### Stakeholder
+
+Compliance officers, investor relations teams, institutional investors, and regulators who need to assess disclosure quality at scale across thousands of filings.
+
+### What Decisions Classification Enables
+
+- **Investors:** Screen for governance quality; identify firms with weak cyber posture before incidents
+- **Regulators:** Flag filings that may not meet the spirit of the rule
+- **Boards:** Benchmark their own disclosures against peers
+- **Researchers:** Large-scale empirical studies of disclosure quality
+
+### Error Consequences
+
+- **False positive (labels boilerplate as specific):** Overstates disclosure quality — less harmful
+- **False negative (labels specific as boilerplate):** Understates quality — could unfairly penalize well-governed firms. More harmful for investment decisions.
+
+### Why Now
+
+- ~9,000-10,000 filings exist (FY2023 + FY2024 cycles)
+- iXBRL CYD taxonomy went live Dec 2024 — programmatic extraction now possible
+- Volume makes manual review infeasible; leadership needs scalable measurement
+
+---
+
+## Construct Definition
+
+**Theoretical foundation:** Disclosure theory (Verrecchia, 2001) and regulatory compliance as information provision. The SEC rule itself provides a natural taxonomy — its structured requirements map directly to a multi-class classification task.
+
+**Unit of analysis:** The paragraph within Item 1C (10-K) or Item 1.05 (8-K).
+
+**Two classification dimensions applied simultaneously:**
+
+### Dimension 1: Content Category (single-label, 7 classes)
+
+| Category | SEC Basis | What It Covers |
+|----------|-----------|----------------|
+| Board Governance | 106(c)(1) | Board/committee oversight, briefing frequency, board cyber expertise |
+| Management Role | 106(c)(2) | CISO/CTO identification, qualifications, reporting structure |
+| Risk Management Process | 106(b) | Assessment processes, ERM integration, framework references |
+| Third-Party Risk | 106(b) | Vendor oversight, external assessors, supply chain risk |
+| Incident Disclosure | 8-K 1.05 | Nature/scope/timing of incidents, material impact, remediation |
+| Strategy Integration | 106(b)(2) | Material impact on business strategy, cyber insurance, resource allocation |
+| None/Other | — | Boilerplate intros, legal disclaimers, non-cybersecurity content |
+
+### Dimension 2: Specificity (4-point ordinal scale)
+
+| Level | Label | Decision Test |
+|-------|-------|---------------|
+| 1 | Generic Boilerplate | "Could I paste this into a different company's filing unchanged?" → Yes |
+| 2 | Sector-Adapted | "Does this name something specific but not unique to THIS company?" → Yes |
+| 3 | Firm-Specific | "Does this contain at least one fact unique to THIS company?" → Yes |
+| 4 | Quantified-Verifiable | "Could an outsider verify a specific claim in this paragraph?" → Yes |
+
+Full rubric details, examples, and boundary rules are in [LABELING-CODEBOOK.md](LABELING-CODEBOOK.md).
+
+---
+
+## Deliverables Checklist
+
+### A) Executive Memo (max 5 pages)
+- [ ] Construct definition + why it matters + theoretical grounding
+- [ ] Data source + governance/ethics
+- [ ] Label schema overview
+- [ ] Results summary: best GenAI vs best specialist
+- [ ] Cost/time/reproducibility comparison
+- [ ] Recommendation for a real firm
+
+### B) Technical Appendix (slides or PDF)
+- [ ] Pipeline diagram (data → labels → model → evaluation)
+- [ ] Label codebook
+- [ ] Benchmark table (6+ GenAI models from 3+ suppliers)
+- [ ] Fine-tuning experiments + results
+- [ ] Error analysis: where does it fail and why?
+
+### C) Code + Artifacts
+- [ ] Reproducible notebooks
+- [ ] Datasets: holdout with human labels, train/test with GenAI labels, all model labels per run + majority labels
+- [ ] Saved fine-tuned model + inference script (link to shared drive, not Canvas)
+- [ ] Cost/time log
+
+---
+
+## Grading Rubric (100%)
+
+| Component | Weight |
+|-----------|--------|
+| Business framing & construct clarity | 20% |
+| Data pipeline quality + documentation | 15% |
+| Human labeling process + reliability | 15% |
+| GenAI benchmarking rigor | 20% |
+| Fine-tuning rigor + evaluation discipline | 20% |
+| Final comparison + recommendation quality | 10% |
+
+### Grade Targets
+
+**C range:** F1 > 0.80, performance comparison, labeled datasets, documentation, reproducible notebooks
+
+**B range (C + 3 of these):**
+- Cost, time, reproducibility analysis
+- 6+ models from 3+ suppliers
+- Contemporary data you collected (not off-the-shelf)
+- Compelling business case
+
+**A range (B + 3 of these):**
+- Error analysis (corner cases, rare/complex texts)
+- Mitigation strategy for identified model weaknesses
+- Additional baselines (dictionaries, topic models, etc.)
+- Comparison to amateur labels
+
+---
+
+## Corpus Size
+
+| Filing Type | Estimated Count |
+|-------------|----------------|
+| 10-K with Item 1C (FY2023 cycle) | ~4,500 |
+| 10-K with Item 1C (FY2024 cycle) | ~4,500 |
+| 8-K cybersecurity incidents | ~80 filings |
+| **Total filings** | **~9,000-10,000** |
+| **Estimated paragraphs** | **~50,000-80,000** |
+
+### Data Targets (per syllabus)
+
+- **20,000 texts** for train/test (GenAI-labeled)
+- **1,200 texts** for locked holdout (human-labeled, 3 annotators each)
+
+---
+
+## Team Roles (6 people)
+
+| Role | Responsibility |
+|------|---------------|
+| Data Lead | EDGAR extraction pipeline, paragraph segmentation, data cleaning |
+| Data Support | 8-K extraction, breach database cross-referencing, dataset QA |
+| Labeling Lead | Rubric refinement, GenAI prompt engineering, MMC pipeline orchestration |
+| Annotation | Gold set human labeling, inter-rater reliability, active learning review |
+| Model Lead | DAPT pre-training, classification fine-tuning, ablation experiments |
+| Eval & Writing | Validation tests, metrics computation, final presentation, documentation |
+
+---
+
+## 3-Week Schedule
+
+### Week 1: Data + Rubric
+- Set up EDGAR extraction pipeline (edgar-crawler + sec-edgar-downloader)
+- Set up 8-K extraction (sec-8k-item105)
+- Draft and pilot labeling rubric v1 on 30 paragraphs
+- Begin bulk 10-K download (FY2023 + FY2024 cycles)
+- Extract all 8-K cyber filings (Items 1.05, 8.01, 7.01)
+- Build company metadata table (CIK → ticker → GICS sector → market cap)
+- Compare pilot labels, compute initial inter-rater agreement, revise rubric → v2
+- Begin DAPT pre-training (SEC-ModernBERT-large, ~2-3 days on 3090)
+- **Friday milestone:** Full paragraph corpus ready (~50K+), 8-K dataset complete, evaluation framework ready
+- Launch Stage 1 dual annotation (Sonnet + Gemini Flash) on full corpus
+
+### Week 2: Labeling + Training
+- Monitor and complete dual annotation
+- Gold set human labeling (300-500 paragraphs, stratified, 2+ annotators)
+- Extract disagreements (~17%), run Stage 2 judge panel (Opus + GPT-5 + Gemini Pro)
+- Active learning pass on low-confidence cases
+- Fine-tuning experiments: DeBERTa baseline → ModernBERT → SEC-ModernBERT → NeoBERT → Ensemble
+- **Wednesday milestone:** Gold set validated, Kappa computed
+- **Friday milestone:** Labeled dataset finalized, all training complete
+
+### Week 3: Evaluation + Presentation
+- Publish dataset to HuggingFace
+- Run validation tests (breach prediction, known-groups, boilerplate index)
+- Write all sections, create figures
+- Code cleanup, README
+- **Thursday:** Full team review and rehearsal
+- **Friday:** Presentation day
+
+### Critical Path
+```
+Data extraction → Paragraph corpus → GenAI labeling → Judge panel → Final labels
+ ↓
+Rubric design → Pilot → Rubric v2 ──────────────────────────────────→ Gold set validation
+ ↓
+DAPT pre-training ─────→ Fine-tuning experiments ──→ Evaluation ──→ Final comparison
+```
+
+---
+
+## Budget
+
+| Item | Cost |
+|------|------|
+| GenAI Stage 1 dual annotation (50K × 2 models, batch) | ~$115 |
+| GenAI Stage 2 judge panel (~8.5K × 3 models, batch) | ~$55 |
+| Prompt caching savings | -$30 to -$40 |
+| SEC EDGAR data | $0 |
+| Breach databases | $0 |
+| Compute (RTX 3090, owned) | $0 |
+| **Total** | **~$130-170** |
+
+---
+
+## GPU-Free Work (next 2 days)
+
+Everything below can proceed without GPU:
+
+- [ ] Set up project repo structure, dependencies, environment
+- [ ] Build EDGAR extraction pipeline (download + parse Item 1C)
+- [ ] Build 8-K extraction pipeline
+- [ ] Paragraph segmentation logic
+- [ ] Company metadata table (CIK → ticker → GICS sector)
+- [ ] Download PleIAs/SEC corpus for future DAPT
+- [ ] Refine labeling rubric, create pilot samples
+- [ ] Set up GenAI labeling scripts (batch API calls)
+- [ ] Set up evaluation framework (metrics computation code)
+- [ ] Download breach databases (PRC, VCDB, CISA KEV)
+- [ ] Gold set sampling strategy
+- [ ] Begin human labeling of pilot set
+
+### GPU-Required (deferred)
+- DAPT pre-training of SEC-ModernBERT-large (~2-3 days on 3090)
+- All classification fine-tuning experiments
+- Model inference and evaluation
diff --git a/docs/TECHNICAL-GUIDE.md b/docs/TECHNICAL-GUIDE.md
new file mode 100644
index 0000000..13cc48e
--- /dev/null
+++ b/docs/TECHNICAL-GUIDE.md
@@ -0,0 +1,478 @@
+# Technical Guide — SEC Cybersecurity Disclosure Classifier
+
+Everything needed to build the pipeline: data acquisition, GenAI labeling, model training, evaluation, and references.
+
+**Stack:** TypeScript (bun) for data/labeling/eval, Python (uv) for training. Vercel AI SDK v6 + OpenRouter for all LLM calls. HuggingFace Trainer for encoder training, Unsloth for decoder experiment.
+
+---
+
+## 1. Data Acquisition
+
+### 1.1 Extracting 10-K Item 1C
+
+**Pipeline:**
+```
+EDGAR API → download 10-K HTML → extract Item 1C → paragraph segmentation → JSONL
+```
+
+**Tools:**
+
+| Tool | Purpose | Install | Notes |
+|------|---------|---------|-------|
+| `sec-edgar-downloader` | Bulk download 10-K filings by CIK | `uv add sec-edgar-downloader` | Pure downloader, no parsing |
+| `edgar-crawler` | Extract specific item sections to JSON | `git clone github.com/lefterisloukas/edgar-crawler` | Configure `['1C']` in items list |
+| `edgartools` | Interactive exploration, XBRL parsing | `uv add edgartools` | `tenk['Item 1C']` accessor; great for prototyping |
+
+**EDGAR API requirements:**
+- Rate limit: 10 requests/second
+- Required: Custom `User-Agent` header with name and email (e.g., `"sec-cyBERT team@email.com"`)
+- SEC blocks requests without proper User-Agent (returns 403)
+
+**For iXBRL-tagged filings (2025+):** Use `edgartools` XBRL parser to extract CYD taxonomy elements directly. The `cyd` prefix tags give pre-structured data aligned with regulatory categories.
+
+**Fallback corpus:** `PleIAs/SEC` on HuggingFace (373K 10-K full texts, CC0 license) — sections NOT pre-parsed; must extract Item 1C yourself.
+
+### 1.2 Extracting 8-K Incident Disclosures
+
+| Tool | Purpose |
+|------|---------|
+| `sec-8k-item105` | Extract Item 1.05 from 8-Ks, iXBRL + HTML fallback — `github.com/JMousqueton/sec-8k-item105` |
+| `SECurityTr8Ker` | Monitor SEC RSS for new cyber 8-Ks — `github.com/pancak3lullz/SECurityTr8Ker` |
+| Debevoise 8-K Tracker | Curated list with filing links — `debevoisedatablog.com` |
+| Board Cybersecurity Tracker | Links filings to MITRE ATT&CK — `board-cybersecurity.com/incidents/tracker` |
+
+**Critical:** Must capture Item 1.05 AND Items 8.01/7.01 (post-May 2024 shift where companies moved non-material disclosures away from 1.05).
+
+### 1.3 Paragraph Segmentation
+
+Once Item 1C text is extracted:
+- Split on double newlines or `<br>` tags (depending on extraction format)
+- **Minimum** paragraph length: 20 words (filter out headers, whitespace)
+- **Maximum** paragraph length: 500 words (split longer blocks at sentence boundaries)
+- Preserve metadata: company name, CIK, ticker, filing date, fiscal year
+
+**Expected yield:** ~5-8 paragraphs per Item 1C × ~9,000 filings = **~50,000-70,000 paragraphs**
+
+### 1.4 Pre-Existing Datasets
+
+| Resource | What It Is | License |
+|----------|-----------|---------|
+| [PleIAs/SEC](https://huggingface.co/datasets/PleIAs/SEC) | 373K full 10-K texts | CC0 |
+| [EDGAR-CORPUS](https://huggingface.co/datasets/eloukas/edgar-corpus) | 220K filings with sections pre-parsed | Apache 2.0 |
+| [Board Cybersecurity 23-Feature Analysis](https://www.board-cybersecurity.com/research/insights/) | Regex extraction of 23 governance features from 4,538 10-Ks | Research |
+| [Gibson Dunn S&P 100 Survey](https://corpgov.law.harvard.edu/2025/01/09/cybersecurity-disclosure-overview-a-survey-of-form-10-k-cybersecurity-disclosures-by-sp-100-companies/) | Detailed disclosure feature analysis | Research |
+| [Florackis et al. (2023)](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3725130) | Firm-level cyber risk measure from 10-K text | SSRN |
+| [zeroshot/cybersecurity-corpus](https://huggingface.co/datasets/zeroshot/cybersecurity-corpus) | General cybersecurity text (useful for DAPT) | HuggingFace |
+
+---
+
+## 2. GenAI Labeling Pipeline
+
+All LLM calls go through **OpenRouter** via `@openrouter/ai-sdk-provider` + Vercel AI SDK v6 `generateObject`. OpenRouter returns actual cost in `usage.cost` — no estimation needed.
+
+### 2.1 Model Panel
+
+**Stage 1 — Three Independent Annotators (all ~50K paragraphs):**
+
+All three are reasoning models. Use low reasoning effort to get a cheap thinking pass without blowing up token costs.
+
+| Model | OpenRouter ID | Role | Reasoning |
+|-------|--------------|------|-----------|
+| Gemini 3.1 Flash Lite | `google/gemini-3.1-flash-lite-preview` | Cheap + capable | Low effort |
+| MiMo-V2-Flash | `xiaomi/mimo-v2-flash` | Xiaomi reasoning flash | Low effort |
+| Grok 4.1 Fast | `x-ai/grok-4.1-fast` | xAI fast tier | Low effort |
+
+Provider diversity: Google, Xiaomi, xAI — three different architectures, minimizes correlated errors.
+
+**Stage 2 — Judge for Disagreements (~15-20% of paragraphs):**
+
+| Model | OpenRouter ID | Role | Reasoning |
+|-------|--------------|------|-----------|
+| Claude Sonnet 4.6 | `anthropic/claude-sonnet-4.6` | Tiebreaker judge | Medium effort |
+
+**Full Benchmarking Panel (run on 1,200 holdout alongside human labels):**
+
+The Stage 1 models plus 6 SOTA frontier models — 9 total from 7 providers.
+
+| Model | OpenRouter ID | Provider | Reasoning |
+|-------|--------------|----------|-----------|
+| Gemini 3.1 Flash Lite | `google/gemini-3.1-flash-lite-preview` | Google | Low |
+| MiMo-V2-Flash | `xiaomi/mimo-v2-flash` | Xiaomi | Low |
+| Grok 4.1 Fast | `x-ai/grok-4.1-fast` | xAI | Low |
+| GPT-5.4 | `openai/gpt-5.4` | OpenAI | Medium |
+| Claude Sonnet 4.6 | `anthropic/claude-sonnet-4.6` | Anthropic | Medium |
+| Gemini 3.1 Pro Preview | `google/gemini-3.1-pro-preview` | Google | Medium |
+| GLM-5 | `zhipu/glm-5` | Zhipu AI | Medium |
+| MiniMax-M2.7 | `minimax/minimax-m2.7` | MiniMax | Medium |
+| MiMo-V2-Pro | `xiaomi/mimo-v2-pro` | Xiaomi | Medium |
+
+That's **9 models from 7 providers** (Google, Xiaomi, xAI, OpenAI, Anthropic, Zhipu AI, MiniMax), far exceeding the 6-from-3 requirement. All support structured outputs on OpenRouter.
+
+### 2.2 Consensus Algorithm
+
+**Stage 1: 3-model majority vote.**
+- Each of 3 models independently labels every paragraph via `generateObject` with the `LabelOutput` Zod schema (includes per-dimension confidence ratings).
+- For each paragraph, compare the 3 labels on both dimensions (category + specificity).
+- If 2/3 or 3/3 agree on BOTH dimensions → consensus reached.
+- Expected agreement rate: ~80-85%.
+- **Confidence-aware routing:** Even when models agree, if all 3 report "low" confidence on either dimension, route to Stage 2 judge anyway. These are hard cases that deserve a stronger model's opinion.
+
+**Stage 2: Judge tiebreaker.**
+- Claude Sonnet 4.6 (medium reasoning effort) receives the paragraph + all 3 Stage 1 labels (randomized order for anti-bias).
+- Judge's label is treated as authoritative — if judge agrees with any Stage 1 model on both dimensions, that label wins. Otherwise judge's label is used directly.
+- Remaining unresolved cases (~1-2%) flagged for human review.
+
+**Stage 3: Active learning pass.**
+- Cluster low-confidence cases by embedding similarity.
+- Human-review ~2-5% of total to identify systematic rubric failures.
+- Iterate rubric if patterns emerge, re-run affected subsets.
+
+### 2.3 Reasoning Configuration
+
+All Stage 1 and benchmark models are reasoning-capable. We use provider-appropriate "low" or "medium" effort settings to balance quality and cost.
+
+**OpenRouter reasoning params** (passed via `providerOptions` or model-specific params):
+- **Google Gemini**: `thinkingConfig: { thinkingBudget: 256 }` (low) / `1024` (medium)
+- **Xiaomi MiMo**: Thinking is default-on; use `reasoning_effort: "low"` / `"medium"` if supported
+- **xAI Grok**: `reasoning_effort: "low"` / `"medium"`
+- **OpenAI GPT-5.4**: `reasoning: { effort: "low" }` / `"medium"`
+- **Anthropic Claude**: `thinking: { budgetTokens: 512 }` (low) / `2048` (medium)
+
+Exact param names may vary per model on OpenRouter — verify during pilot. The reasoning tokens are tracked separately in `usage.completion_tokens_details.reasoning_tokens`.
+
+### 2.4 Cost Tracking
+
+OpenRouter returns **actual cost** in `usage.cost` for every response. No estimation needed. Reasoning tokens are included in cost automatically.
+
+### 2.5 Rate Limiting
+
+OpenRouter uses **credit-based limiting** for paid accounts, not fixed RPM. Your key shows `requests: -1` (unlimited). There is no hard request-per-second cap — only Cloudflare DDoS protection if you dramatically exceed reasonable usage.
+
+**Our approach:** Use `p-limit` concurrency control, starting at 10-15 concurrent requests. Ramp up if no 429s or latency degradation. Monitor account usage via `GET /api/v1/key`.
+
+### 2.6 Technical Implementation
+
+**Core pattern:** `generateObject` with Zod schema via OpenRouter.
+
+```typescript
+import { generateObject } from "ai";
+import { createOpenRouter } from "@openrouter/ai-sdk-provider";
+import { LabelOutput } from "../schemas/label";
+
+const openrouter = createOpenRouter();
+
+const result = await generateObject({
+ model: openrouter("google/gemini-3.1-flash-lite-preview"),
+ schema: LabelOutput,
+ system: SYSTEM_PROMPT,
+ prompt: buildUserPrompt(paragraph),
+ temperature: 0,
+ mode: "json",
+ // Reasoning effort — model-specific, set per provider
+ providerOptions: {
+ google: { thinkingConfig: { thinkingBudget: 256 } },
+ },
+});
+
+// result.object: { content_category, specificity_level, category_confidence, specificity_confidence, reasoning }
+// result.usage: { promptTokens, completionTokens }
+// OpenRouter response body also includes usage.cost (actual USD)
+// and usage.completion_tokens_details.reasoning_tokens
+```
+
+**Generation ID tracking:** Every OpenRouter response includes an `id` field (the generation ID). We store this in every annotation record for audit trail and `GET /api/v1/generation?id={id}` lookup.
+
+**Batch processing:** Concurrency-limited via `p-limit` (start at 10-15 concurrent). Each successful annotation is appended immediately to JSONL (crash-safe checkpoint). On resume, completed paragraph IDs are read from the output file and skipped. Graceful shutdown on SIGINT — wait for in-flight requests, write session summary.
+
+**Structured output:** All panel models support `structured_outputs` on OpenRouter. Use `mode: "json"` in `generateObject`. Response Healing plugin (`plugins: [{ id: 'response-healing' }]`) available for edge cases.
+
+**Live observability:** Every script that hits APIs renders a live dashboard to stderr (progress, ETA, session cost, latency percentiles, reasoning token usage). Session summaries append to `data/metadata/sessions.jsonl`.
+
+**Prompt tuning before scale:** See LABELING-CODEBOOK.md for the 4-phase iterative prompt tuning protocol. Micro-pilot (30 paragraphs) → prompt revision → scale pilot (200 paragraphs) → green light. Do not fire the full 50K run until the scale pilot passes agreement targets.
+
+---
+
+## 3. Model Strategy
+
+### 3.1 Primary: SEC-ModernBERT-large
+
+**This model does not exist publicly. Building it is a core contribution.**
+
+**Base model:** [`answerdotai/ModernBERT-large`](https://huggingface.co/answerdotai/ModernBERT-large)
+- 395M parameters
+- 8,192-token native context (vs. 512 for DeBERTa-v3-large)
+- RoPE + alternating local/global attention + FlashAttention
+- 2-4x faster than DeBERTa-v3-large
+- Apache 2.0 license
+- GLUE: 90.4
+
+**Step 1 — Domain-Adaptive Pre-Training (DAPT):**
+
+Continue MLM pre-training on SEC filing text to create "SEC-ModernBERT-large":
+- **Training corpus:** 200-500M tokens from PleIAs/SEC or own EDGAR download. Include 10-Ks, 10-Qs, 8-Ks, proxy statements.
+- **MLM objective:** 30% masking rate (ModernBERT convention)
+- **Learning rate:** ~5e-5 (search range: 1e-5 to 1e-4)
+- **Hardware (RTX 3090):** bf16, gradient checkpointing, seq_len=1024-2048, batch_size=2-4 + gradient accumulation to effective batch 16-32
+- **VRAM estimate:** ~12-15GB at batch=4, seq=2048 with gradient checkpointing — fits on 3090
+- **Duration:** ~2-3 days on single 3090
+- **Framework:** HuggingFace Trainer + `DataCollatorForLanguageModeling` (Python script, not notebook)
+
+**Evidence DAPT works:**
+- Gururangan et al. (2020): consistent improvements across all tested domains
+- Clinical ModernBERT, BioClinical ModernBERT: successful continued MLM on medical text
+- Patent domain ModernBERT (arXiv:2509.14926): +0.9 to +2.8 F1 from continued pre-training on 31.6B tokens
+- SEC filing scaling laws (arXiv:2512.12384): consistent improvement, largest gains in first 200M tokens
+
+**Step 2 — Classification Fine-Tuning:**
+
+Fine-tune SEC-ModernBERT-large on the labeled paragraphs:
+- **Architecture:** Shared encoder backbone → dropout → two linear classification heads
+ - `category_head`: 7-class softmax (content category)
+ - `specificity_head`: 4-class softmax (specificity level)
+- **Loss:** `α × CE(category) + (1-α) × CE(specificity) + β × SCL`
+ - `α` (category_weight): default 0.5, searchable
+ - `β` (scl_weight): default 0, searchable (ablation)
+- **Sequence length:** 2048 tokens
+- **VRAM:** ~11-13GB at batch=8, seq=2048 in bf16 — comfortable on 3090
+- **bf16=True** in HuggingFace Trainer (3090 Ampere supports natively)
+- **Framework:** Custom `MultiHeadClassifier` model + HuggingFace Trainer subclass
+
+### 3.2 Dark Horse: NeoBERT
+
+[`chandar-lab/NeoBERT`](https://huggingface.co/chandar-lab/NeoBERT)
+- 250M parameters (100M fewer than ModernBERT-large)
+- 4,096-token context
+- SwiGLU, RoPE, Pre-RMSNorm, FlashAttention
+- GLUE: 89.0 | MTEB: 51.3 (best in class — ModernBERT is 46.9)
+- MIT license
+- Requires `trust_remote_code=True`
+
+Same DAPT + fine-tuning pipeline, even less VRAM. Interesting efficiency vs. quality tradeoff.
+
+### 3.3 Baseline: DeBERTa-v3-large
+
+[`microsoft/deberta-v3-large`](https://huggingface.co/microsoft/deberta-v3-large)
+- ~435M total parameters
+- 512-token context (can push to ~1024)
+- GLUE: 91.4 (highest among encoders)
+- MIT license
+- **Weakness:** no long context, fails at retrieval
+
+Include as baseline to show improvement from (a) long context and (b) DAPT.
+
+### 3.4 Decoder Experiment: Qwen3.5 via Unsloth
+
+Experimental comparison of encoder vs. decoder approach:
+- **Model:** Qwen3.5-1.5B or Qwen3.5-7B (smallest viable decoder)
+- **Framework:** Unsloth (2x faster than Axolotl, 80% less VRAM, optimized for Qwen)
+- **Method:** QLoRA fine-tuning — train the model to output the same JSON schema as the GenAI labelers
+- **Purpose:** "Additional baseline" for A-grade requirement + demonstrates encoder advantage for classification
+
+### 3.5 Domain-Specific Baselines (for comparison)
+
+All BERT-base (110M params, 512 context) — architecturally outdated:
+
+| Model | HuggingFace ID | Domain |
+|-------|---------------|--------|
+| SEC-BERT | `nlpaueb/sec-bert-base` | 260K 10-K filings |
+| FinBERT | `ProsusAI/finbert` | Financial sentiment |
+| SecureBERT | arXiv:2204.02685 | Cybersecurity text |
+
+### 3.6 Ablation Design
+
+| # | Experiment | Model | Context | DAPT | SCL | Purpose |
+|---|-----------|-------|---------|------|-----|---------|
+| 1 | Baseline | DeBERTa-v3-large | 512 | No | No | Standard approach per syllabus |
+| 2 | + Long context | ModernBERT-large | 2048 | No | No | Context window benefit |
+| 3 | + Domain adapt | SEC-ModernBERT-large | 2048 | Yes | No | DAPT benefit |
+| 4 | + Contrastive | SEC-ModernBERT-large | 2048 | Yes | Yes | SCL benefit |
+| 5 | Efficiency | NeoBERT (+ DAPT) | 2048 | Yes | Yes | 40% fewer params |
+| 6 | Decoder | Qwen3.5 LoRA | 2048 | No | No | Encoder vs decoder |
+| 7 | **Ensemble** | SEC-ModernBERT + DeBERTa | mixed | mixed | — | Maximum performance |
+
+### 3.7 Hyperparameter Search (Autoresearch Pattern)
+
+Inspired by Karpathy's [autoresearch](https://github.com/karpathy/autoresearch): an agent autonomously iterates on training configs using a `program.md` directive.
+
+**How it works:**
+1. Agent reads `program.md` which defines: fixed time budget (30 min), evaluation metric (`val_macro_f1`), what can be modified (YAML config values), what cannot (data splits, eval script, seed)
+2. Agent modifies one hyperparameter in the YAML config
+3. Agent runs training for 30 minutes
+4. Agent evaluates on validation set
+5. If `val_macro_f1` improved by ≥ 0.002 → keep checkpoint, else discard
+6. Agent logs result to `results/experiments.tsv` and repeats
+
+**Search spaces:**
+
+DAPT:
+- learning_rate: [1e-5, 2e-5, 5e-5, 1e-4]
+- mlm_probability: [0.15, 0.20, 0.30]
+- max_seq_length: [1024, 2048]
+- effective batch size: [8, 16, 32]
+
+Encoder fine-tuning:
+- learning_rate: [1e-5, 2e-5, 3e-5, 5e-5]
+- category_weight: [0.3, 0.4, 0.5, 0.6, 0.7]
+- label_smoothing: [0, 0.05, 0.1]
+- scl_weight: [0, 0.1, 0.2, 0.5]
+- dropout: [0.05, 0.1, 0.2]
+- pool_strategy: ["cls", "mean"]
+- max_seq_length: [512, 1024, 2048]
+
+Decoder (Unsloth LoRA):
+- lora_r: [8, 16, 32, 64]
+- lora_alpha: [16, 32, 64]
+- learning_rate: [1e-4, 2e-4, 5e-4]
+
+---
+
+## 4. Evaluation & Validation
+
+### 4.1 Required Metrics
+
+| Metric | Target | Notes |
+|--------|--------|-------|
+| Macro-F1 on holdout | > 0.80 for C, higher for A | Per-class and overall |
+| Per-class F1 | Identify weak categories | Expect "None/Other" noisiest |
+| Krippendorff's Alpha | > 0.67 adequate, > 0.75 good | GenAI vs human gold set |
+| MCC | Report alongside F1 | More robust for imbalanced classes |
+| Specificity MAE | Report for ordinal dimension | Mean absolute error: \|pred - true\| |
+| Calibration plots | Reliability diagrams | For softmax outputs |
+| Robustness splits | By time, industry, filing size | FY2023 vs FY2024; GICS sector; word count quartiles |
+
+### 4.2 Downstream Validity Tests
+
+**Test 1 — Breach Prediction (strongest):**
+Do firms with lower specificity scores subsequently appear in breach databases?
+- [Privacy Rights Clearinghouse](http://dx.doi.org/10.17632/w33nhh3282.1) — 80K+ breaches, ticker/CIK matching
+- [VCDB](https://github.com/vz-risk/VCDB) — 8K+ incidents, VERIS schema
+- [Board Cybersecurity Incident Tracker](https://www.board-cybersecurity.com/incidents/tracker) — direct SEC filing links
+- [CISA KEV Catalog](https://www.cisa.gov/known-exploited-vulnerabilities-catalog) — known exploited vulnerabilities
+
+**Test 2 — Market Reaction (optional):**
+Event study: abnormal returns around 8-K Item 1.05 filing. Does prior Item 1C quality predict reaction magnitude? Small sample (~55 incidents) but high signal.
+
+**Test 3 — Known-Groups Validity (easy, always include):**
+Do regulated industries (NYDFS, HIPAA) produce higher-specificity disclosures? Do larger firms have more specific disclosures? Expected results that validate the measure.
+
+**Test 4 — Boilerplate Index (easy, always include):**
+Cosine similarity of each company's Item 1C to industry-median disclosure. Specificity score should inversely correlate — independent, construct-free validation.
+
+### 4.3 External Benchmark
+
+Per syllabus requirement:
+- **Board Cybersecurity's 23-feature regex extraction** — natural benchmark. Their binary feature coding is prior best practice. Our classifier captures everything their regex does plus quality/specificity.
+- **Florackis et al. (2023) cyber risk measure** — different section (1A vs 1C), different methodology, different era.
+
+---
+
+## 5. SEC Regulatory Context
+
+### The Rule: SEC Release 33-11216 (July 2023)
+
+**Item 1C (10-K Annual Disclosure) — Regulation S-K Item 106:**
+
+*Item 106(b) — Risk Management and Strategy:*
+1. Processes for assessing, identifying, and managing material cybersecurity risks
+2. Whether cybersecurity processes integrate into overall ERM
+3. Whether the company engages external assessors, consultants, or auditors
+4. Processes to oversee risks from third-party service providers
+5. Whether cybersecurity risks have materially affected business strategy, results, or financial condition
+
+*Item 106(c) — Governance:*
+- Board oversight (106(c)(1)): oversight description, responsible committee, information processes
+- Management's role (106(c)(2)): responsible positions, expertise, monitoring processes, board reporting frequency
+
+**Item 1.05 (8-K Incident Disclosure):**
+- Required within 4 business days of materiality determination
+- Material aspects of nature, scope, timing + material impact
+- No technical details that would impede response/remediation
+- AG can delay up to 120 days for national security
+
+**Key design note:** The SEC uses "describe" — non-exclusive suggestions create natural variation in specificity and content. This is what makes the construct classifiable.
+
+### Compliance Timeline
+
+| Date | Milestone |
+|------|-----------|
+| Jul 26, 2023 | Rule adopted |
+| Dec 15, 2023 | Item 1C required in 10-Ks |
+| Dec 18, 2023 | Item 1.05 required in 8-Ks |
+| Jun 15, 2024 | Item 1.05 required for smaller reporting companies |
+| Dec 15, 2024 | iXBRL tagging of Item 106 (CYD taxonomy) required |
+
+### iXBRL CYD Taxonomy
+
+Published Sep 16, 2024. Starting Dec 15, 2024, Item 1C tagged in Inline XBRL with `cyd` prefix.
+- Schema: `http://xbrl.sec.gov/cyd/2024`
+- [Taxonomy guide (PDF)](https://xbrl.sec.gov/cyd/2024/cyd-taxonomy-guide-2024-09-16.pdf)
+
+---
+
+## 6. References
+
+### SEC Rule & Guidance
+- [SEC Final Rule 33-11216 (PDF)](https://www.sec.gov/files/rules/final/2023/33-11216.pdf)
+- [SEC Fact Sheet](https://www.sec.gov/files/33-11216-fact-sheet.pdf)
+- [SEC Small Business Compliance Guide](https://www.sec.gov/resources-small-businesses/small-business-compliance-guides/cybersecurity-risk-management-strategy-governance-incident-disclosure)
+- [CYD iXBRL Taxonomy Guide (PDF)](https://xbrl.sec.gov/cyd/2024/cyd-taxonomy-guide-2024-09-16.pdf)
+
+### Law Firm Surveys & Analysis
+- [Gibson Dunn S&P 100 Survey](https://corpgov.law.harvard.edu/2025/01/09/cybersecurity-disclosure-overview-a-survey-of-form-10-k-cybersecurity-disclosures-by-sp-100-companies/)
+- [PwC First Wave of 10-K Cyber Disclosures](https://www.pwc.com/us/en/services/consulting/cybersecurity-risk-regulatory/sec-final-cybersecurity-disclosure-rules/sec-10-k-cyber-disclosures.html)
+- [Debevoise 8-K Tracker](https://www.debevoisedatablog.com/2024/03/06/cybersecurity-form-8-k-tracker/)
+- [Greenberg Traurig 2025 Trends](https://www.gtlaw.com/en/insights/2025/2/sec-cybersecurity-disclosure-trends-2025-update-on-corporate-reporting-practices)
+- [Known Trends: First Year of 8-K Filings](https://www.knowntrends.com/2025/02/snapshot-the-first-year-of-cybersecurity-incident-filings-on-form-8-k-since-adoption-of-new-rules/)
+- [NYU: Lessons Learned from 8-K Reporting](https://wp.nyu.edu/compliance_enforcement/2025/03/25/lessons-learned-one-year-of-form-8-k-material-cybersecurity-incident-reporting/)
+
+### Data Extraction Tools
+- [edgar-crawler](https://github.com/lefterisloukas/edgar-crawler)
+- [edgartools](https://github.com/dgunning/edgartools)
+- [sec-edgar-downloader](https://pypi.org/project/sec-edgar-downloader/)
+- [sec-8k-item105](https://github.com/JMousqueton/sec-8k-item105)
+- [SECurityTr8Ker](https://github.com/pancak3lullz/SECurityTr8Ker)
+- [SEC EDGAR APIs](https://www.sec.gov/search-filings/edgar-application-programming-interfaces)
+- [SEC EDGAR Full-Text Search](https://efts.sec.gov/LATEST/search-index)
+
+### Datasets
+- [PleIAs/SEC — 373K 10-K texts (CC0)](https://huggingface.co/datasets/PleIAs/SEC)
+- [EDGAR-CORPUS — 220K filings, sections parsed (Apache 2.0)](https://huggingface.co/datasets/eloukas/edgar-corpus)
+- [Board Cybersecurity 23-Feature Analysis](https://www.board-cybersecurity.com/research/insights/risk-frameworks-security-standards-in-10k-item-1c-cybersecurity-disclosures-through-2024-06-30/)
+- [Board Cybersecurity Incident Tracker](https://www.board-cybersecurity.com/incidents/tracker)
+- [PRC Mendeley Breach Dataset](http://dx.doi.org/10.17632/w33nhh3282.1)
+- [VCDB](https://github.com/vz-risk/VCDB)
+- [CISA KEV Catalog](https://www.cisa.gov/known-exploited-vulnerabilities-catalog)
+- [zeroshot/cybersecurity-corpus](https://huggingface.co/datasets/zeroshot/cybersecurity-corpus)
+
+### Models
+- [ModernBERT-large (Apache 2.0)](https://huggingface.co/answerdotai/ModernBERT-large)
+- [ModernBERT-base (Apache 2.0)](https://huggingface.co/answerdotai/ModernBERT-base)
+- [NeoBERT (MIT)](https://huggingface.co/chandar-lab/NeoBERT)
+- [DeBERTa-v3-large (MIT)](https://huggingface.co/microsoft/deberta-v3-large)
+- [SEC-BERT](https://huggingface.co/nlpaueb/sec-bert-base)
+- [FinBERT](https://huggingface.co/ProsusAI/finbert)
+- [EvasionBench Eva-4B-V2](https://huggingface.co/FutureMa/Eva-4B-V2)
+
+### Key Papers
+- Ringel (2023), "Creating Synthetic Experts with Generative AI" — [SSRN:4542949](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4542949)
+- Ludwig et al. (2026), "Extracting Consumer Insight from Text" — [arXiv:2602.15312](https://arxiv.org/abs/2602.15312)
+- Ma et al. (2026), "EvasionBench" — [arXiv:2601.09142](https://arxiv.org/abs/2601.09142)
+- Florackis et al. (2023), "Cybersecurity Risk" — [SSRN:3725130](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3725130)
+- Gururangan et al. (2020), "Don't Stop Pretraining" — [arXiv:2004.10964](https://arxiv.org/abs/2004.10964)
+- ModernBERT — [arXiv:2412.13663](https://arxiv.org/abs/2412.13663)
+- NeoBERT — [arXiv:2502.19587](https://arxiv.org/abs/2502.19587)
+- ModernBERT vs DeBERTa-v3 — [arXiv:2504.08716](https://arxiv.org/abs/2504.08716)
+- Patent domain ModernBERT DAPT — [arXiv:2509.14926](https://arxiv.org/abs/2509.14926)
+- SEC filing scaling laws — [arXiv:2512.12384](https://arxiv.org/abs/2512.12384)
+- Gunel et al. (2020), Supervised Contrastive Learning — [OpenReview](https://openreview.net/forum?id=cu7IUiOhujH)
+- Phil Schmid, "Fine-tune ModernBERT" — [philschmid.de](https://www.philschmid.de/fine-tune-modern-bert-in-2025)
+- Berkman et al. (2018), Cybersecurity disclosure quality scoring
+- SecureBERT — [arXiv:2204.02685](https://arxiv.org/abs/2204.02685)
+- Gilardi et al. (2023), "ChatGPT Outperforms Crowd-Workers" — [arXiv:2303.15056](https://arxiv.org/abs/2303.15056)
+- Kiefer et al. (2025), ESG-Activities benchmark — [arXiv:2502.21112](https://arxiv.org/abs/2502.21112)
+
+### Methodological Resources
+- [Ringel 2026 Capstone Pipeline Example (ipynb)](http://ringel.ai/UNC/2026/helpers/Ringel_2026_VerticalAI_Capstone_Pipeline_Example.ipynb)
+- [Ringel 2026 Capstone Pipeline Example (zip)](http://ringel.ai/UNC/2026/helpers/Ringel_2026_VerticalAI_Capstone_Pipeline_Example.zip)
+- [Class 21 Exemplary Presentation (PDF)](http://www.ringel.ai/UNC/2026/BUSI488/Class21/Ringel_488-2026_Class21.pdf)
+- [Karpathy autoresearch](https://github.com/karpathy/autoresearch) — autonomous HP search pattern
diff --git a/docs/implementation-plan.md b/docs/implementation-plan.md
new file mode 100644
index 0000000..64a1857
--- /dev/null
+++ b/docs/implementation-plan.md
@@ -0,0 +1,345 @@
+# SEC-cyBERT Implementation Plan
+
+## Context
+
+Building an SEC cybersecurity disclosure quality classifier for the BUSI488/COMP488 capstone. The Ringel (2023) "Synthetic Experts" pipeline: frontier LLMs label ~50K paragraphs, then distill into a small encoder model. Two dimensions: content category (7-class) + specificity (4-point ordinal). GPU is offline for 2 days — all data/labeling/eval infrastructure is GPU-free and should be built now.
+
+---
+
+## Tech Stack
+
+| Layer | Tool | Notes |
+|-------|------|-------|
+| Data/labeling pipeline | TypeScript, Vercel AI SDK 6.0.108, `@openrouter/ai-sdk-provider`, Zod | `generateObject` with Zod schemas for structured output |
+| Stage 1 annotators | gpt-oss-120b, mimo-v2-flash, grok-4.1-fast | Via OpenRouter |
+| Stage 2 judge | Claude Sonnet 4.6 | Via OpenRouter, called only on disagreements |
+| Encoder training | HuggingFace Trainer, Python scripts | ModernBERT-large, NeoBERT, DeBERTa-v3-large |
+| DAPT | HuggingFace Trainer + DataCollatorForLanguageModeling | Continued MLM on SEC filings |
+| Decoder experiment | Unsloth (NOT Axolotl — it's decoder-only and slower) | Qwen3.5 LoRA |
+| HP search | Autoresearch-style `program.md` directives | Agent edits YAML, trains for fixed budget, evaluates, keeps/discards |
+| Runtime | bun (TS), uv (Python) | |
+
+---
+
+## Project Structure
+
+```
+sec-cyBERT/
+├── docs/
+│ ├── PROJECT-OVERVIEW.md
+│ ├── LABELING-CODEBOOK.md
+│ └── TECHNICAL-GUIDE.md
+│
+├── ts/ # TypeScript: data pipeline, labeling, eval
+│ ├── package.json
+│ ├── tsconfig.json
+│ ├── src/
+│ │ ├── schemas/ # Zod schemas (single source of truth)
+│ │ │ ├── filing.ts
+│ │ │ ├── paragraph.ts
+│ │ │ ├── label.ts # LabelOutput — passed to generateObject
+│ │ │ ├── annotation.ts # Label + provenance (model, cost, latency)
+│ │ │ ├── consensus.ts # Multi-model agreement result
+│ │ │ ├── gold.ts # Human-labeled holdout entry
+│ │ │ ├── benchmark.ts # Model performance metrics
+│ │ │ ├── experiment.ts # Autoresearch training tracker
+│ │ │ └── index.ts
+│ │ ├── extract/ # Phase 1: EDGAR extraction
+│ │ │ ├── download-10k.ts
+│ │ │ ├── parse-item1c.ts
+│ │ │ ├── parse-8k.ts
+│ │ │ ├── segment.ts
+│ │ │ └── metadata.ts
+│ │ ├── label/ # Phase 2: GenAI labeling
+│ │ │ ├── annotate.ts # generateObject + OpenRouter per paragraph
+│ │ │ ├── batch.ts # Concurrency control + JSONL checkpointing
+│ │ │ ├── consensus.ts # Stage 1 majority vote logic
+│ │ │ ├── judge.ts # Stage 2 tiebreaker (Sonnet 4.6)
+│ │ │ ├── prompts.ts # System/user prompt builders
+│ │ │ └── cost.ts # Cost tracking aggregation
+│ │ ├── gold/ # Phase 3: Gold set
+│ │ │ ├── sample.ts # Stratified sampling
+│ │ │ ├── human-label.ts # Human label import
+│ │ │ └── agreement.ts # Krippendorff's alpha, Cohen's kappa
+│ │ ├── benchmark/ # Phase 4: GenAI benchmarking
+│ │ │ ├── run.ts
+│ │ │ └── metrics.ts # F1, AUC, MCC computation
+│ │ ├── lib/ # Shared utilities
+│ │ │ ├── openrouter.ts # Singleton + model registry with pricing
+│ │ │ ├── jsonl.ts # Read/write/append JSONL
+│ │ │ ├── checkpoint.ts # Resume from last completed ID
+│ │ │ └── retry.ts # Exponential backoff
+│ │ └── cli.ts # CLI entry point
+│ └── tests/
+│
+├── python/ # Python: training, DAPT, inference
+│ ├── pyproject.toml
+│ ├── configs/
+│ │ ├── dapt/modernbert-large.yaml
+│ │ ├── finetune/
+│ │ │ ├── modernbert-large.yaml
+│ │ │ ├── neobert.yaml
+│ │ │ └── deberta-v3-large.yaml
+│ │ └── decoder/qwen3.5-lora.yaml
+│ ├── src/
+│ │ ├── dapt/train_mlm.py
+│ │ ├── finetune/
+│ │ │ ├── model.py # Multi-head classifier (shared backbone)
+│ │ │ ├── train.py # HF Trainer script with --time-budget
+│ │ │ ├── data.py
+│ │ │ ├── losses.py # SCL + ordinal + multi-head balancing
+│ │ │ └── trainer.py # Custom Trainer subclass
+│ │ ├── decoder/train_lora.py # Unsloth
+│ │ └── eval/
+│ │ ├── predict.py
+│ │ ├── metrics.py
+│ │ └── error_analysis.py
+│ └── program.md # Autoresearch agent directive
+│
+├── data/ # Gitignored heavy files
+│ ├── raw/{10k,8k}/
+│ ├── extracted/{item1c,item105}/
+│ ├── paragraphs/paragraphs.jsonl
+│ ├── annotations/
+│ │ ├── stage1/{model}.jsonl
+│ │ ├── stage2/judge.jsonl
+│ │ └── consensus.jsonl
+│ ├── gold/
+│ │ ├── gold-sample.jsonl
+│ │ ├── human-labels/annotator-{1,2,3}.jsonl
+│ │ └── gold-adjudicated.jsonl
+│ ├── benchmark/runs/{model}.jsonl
+│ ├── splits/{train,val,test}.jsonl
+│ └── dapt-corpus/sec-texts.jsonl
+│
+├── models/ # Gitignored checkpoints
+├── results/
+│ ├── experiments.tsv # Autoresearch log
+│ └── figures/
+└── .gitignore
+```
+
+---
+
+## Core Schemas (Zod)
+
+**`label.ts`** — the contract passed to `generateObject`:
+```typescript
+export const ContentCategory = z.enum([
+ "Board Governance", "Management Role", "Risk Management Process",
+ "Third-Party Risk", "Incident Disclosure", "Strategy Integration", "None/Other",
+]);
+export const SpecificityLevel = z.union([z.literal(1), z.literal(2), z.literal(3), z.literal(4)]);
+export const LabelOutput = z.object({
+ content_category: ContentCategory,
+ specificity_level: SpecificityLevel,
+ reasoning: z.string().max(500),
+});
+```
+
+**`annotation.ts`** — label + full provenance:
+```typescript
+export const Annotation = z.object({
+ paragraphId: z.string().uuid(),
+ label: LabelOutput,
+ provenance: z.object({
+ modelId: z.string(),
+ provider: z.string(),
+ stage: z.enum(["stage1", "stage2-judge"]),
+ runId: z.string().uuid(),
+ promptVersion: z.string(),
+ inputTokens: z.number(),
+ outputTokens: z.number(),
+ estimatedCostUsd: z.number(),
+ latencyMs: z.number(),
+ requestedAt: z.string().datetime(),
+ }),
+});
+```
+
+**`consensus.ts`** — multi-model agreement:
+```typescript
+export const ConsensusResult = z.object({
+ paragraphId: z.string().uuid(),
+ finalLabel: LabelOutput,
+ method: z.enum(["unanimous", "majority", "judge-resolved", "unresolved"]),
+ categoryAgreement: z.object({ votes: z.record(z.number()), agreed: z.boolean() }),
+ specificityAgreement: z.object({ votes: z.record(z.number()), agreed: z.boolean(), spread: z.number() }),
+ stage1ModelIds: z.array(z.string()),
+ stage2JudgeModelId: z.string().nullable(),
+ confidence: z.number().min(0).max(1),
+});
+```
+
+Full schemas for filing, paragraph, gold, benchmark, and experiment types follow the same pattern — see the full plan agent output for complete definitions.
+
+---
+
+## Data Flow
+
+```
+Phase 1: EXTRACTION (GPU-free)
+ EDGAR API → download 10-K/8-K → parse Item 1C/1.05 → segment into paragraphs
+ → enrich with company metadata → data/paragraphs/paragraphs.jsonl (~50-70K records)
+
+Phase 2: LABELING (GPU-free)
+ paragraphs.jsonl → Stage 1: 3 models annotate all → consensus (expect ~83% agree)
+ → disagreements → Stage 2: Sonnet 4.6 judges → final consensus.jsonl
+
+Phase 3: GOLD SET (GPU-free)
+ Stratified sample 1,200 → 3 humans label independently → compute agreement
+ → adjudicate → gold-adjudicated.jsonl (LOCKED holdout)
+
+Phase 4: BENCHMARKING (GPU-free)
+ Run 6+ models on holdout → compute F1/AUC/MCC/Krippendorff's α → comparison table
+
+Phase 5: TRAINING (REQUIRES GPU)
+ DAPT: SEC-ModernBERT-large (continued MLM on SEC filings)
+ Encoder FT: SEC-ModernBERT, ModernBERT, NeoBERT, DeBERTa (5 ablations)
+ Decoder FT: Qwen3.5 via Unsloth LoRA
+ HP search: autoresearch program.md — agent iterates autonomously
+
+Phase 6: EVALUATION (REQUIRES GPU)
+ Inference on holdout → metrics → error analysis → validity tests → final comparison
+```
+
+---
+
+## Key Architecture Patterns
+
+### Annotation: `generateObject` + OpenRouter
+```typescript
+const result = await generateObject({
+ model: openrouter(modelId),
+ schema: LabelOutput,
+ system: buildSystemPrompt(),
+ prompt: buildUserPrompt(paragraph),
+ temperature: 0,
+ mode: "json",
+});
+```
+
+### Batch Processing: Append-per-record checkpoint
+Each successful annotation appends immediately to JSONL. On crash/resume, read completed IDs from output file, skip them. Uses `p-limit` for concurrency control (default 5).
+
+### Consensus: Stage 1 majority → Stage 2 judge
+- Stage 1: 3 models vote. If 2/3 agree on BOTH dimensions → consensus.
+- Stage 2: For disagreements, Sonnet 4.6 gets the paragraph + all 3 annotations (randomized order for anti-bias). Judge's label treated as authoritative tiebreaker.
+
+### Training: Multi-head classifier
+Shared encoder backbone (ModernBERT/NeoBERT/DeBERTa) → dropout → two linear heads:
+- `category_head`: 7-class softmax
+- `specificity_head`: 4-class ordinal/softmax
+Loss: `α * CE(category) + (1-α) * CE(specificity) + β * SCL`
+
+### HP Search: Autoresearch `program.md`
+- Fixed 30-min time budget per experiment
+- Metric: `val_macro_f1`
+- Agent modifies ONLY YAML configs, not training scripts
+- TSV results log: experiment_id, metric, hyperparameters, verdict (keep/discard)
+- Vary ONE hyperparameter per experiment (controlled ablation)
+
+---
+
+## Quality Gates
+
+| Gate | When | Key Check | Threshold | If Failed |
+|------|------|-----------|-----------|-----------|
+| Extraction QA | After Phase 1 | Spot-check 20 filings manually | 18/20 correct | Fix parser |
+| Labeling Pilot | 50 paragraphs | Human review of LLM labels | ≥80% agreement | Revise prompt/rubric |
+| Scale Pilot | 200 paragraphs | Inter-model Fleiss' Kappa | ≥0.60 | Replace weakest model or revise prompt |
+| Human Labeling | Phase 3 | Krippendorff's α (specificity) | ≥0.67 | Collapse 4-pt to 3-pt scale |
+| Human Labeling | Phase 3 | Cohen's κ (category) | ≥0.75 | Revise rubric boundaries |
+| DAPT | Phase 5 | Perplexity decrease + GLUE check | PPL ↓, GLUE drop <2% | Reduce LR |
+| Fine-tuning | Phase 5 | val_macro_f1 by epoch 3 | >0.75 | Check data quality |
+| Final | Phase 6 | Holdout macro-F1 (category) | ≥0.80 | Error analysis, iterate |
+| Final | Phase 6 | Calibration (ECE) | <0.10 | Temperature scaling |
+
+---
+
+## CLI Commands
+
+```bash
+# Extraction
+bun sec extract:download-10k --fiscal-year 2023
+bun sec extract:parse --type 10k
+bun sec extract:segment
+bun sec extract:metadata
+
+# Labeling
+bun sec label:annotate --model openai/gpt-oss-120b --limit 50 # pilot
+bun sec label:annotate-all # full run
+bun sec label:consensus
+bun sec label:judge
+bun sec label:cost
+
+# Gold set
+bun sec gold:sample --n 1200
+bun sec gold:import-human --annotator annotator-1 --input labels.csv
+bun sec gold:agreement
+
+# Benchmarking
+bun sec benchmark:run-all
+bun sec benchmark:evaluate
+bun sec benchmark:table
+
+# Splits
+bun sec splits:create
+
+# Python training (GPU required)
+uv run python/src/dapt/train_mlm.py --config python/configs/dapt/modernbert-large.yaml
+uv run python/src/finetune/train.py --config python/configs/finetune/modernbert-large.yaml --time-budget 1800
+uv run python/src/decoder/train_lora.py --config python/configs/decoder/qwen3.5-lora.yaml
+uv run python/src/eval/predict.py --split test
+uv run python/src/eval/metrics.py
+```
+
+---
+
+## Implementation Sequence
+
+### Day 1 (GPU-free) — Foundation
+1. `bun init` in ts/, `uv init` in python/, create full directory tree
+2. All Zod schemas
+3. JSONL utilities, OpenRouter singleton, model registry
+4. Prompt builders (from LABELING-CODEBOOK.md)
+5. `annotate.ts` + `batch.ts` with checkpoint/resume
+6. Test: dry-run 3 paragraphs
+
+### Day 2 (GPU-free) — Extraction + Labeling Pilot
+7. EDGAR extraction pipeline (download, parse, segment)
+8. Run extraction on a small sample (~100 filings)
+9. **Quality Gate 1**: Verify extraction
+10. Labeling pilot: 50 paragraphs × 3 models
+11. `consensus.ts` + `judge.ts`
+12. **Quality Gate 2**: Manual review
+13. Scale pilot: 200 paragraphs
+14. **Quality Gate 3**: Inter-model agreement
+15. If gates pass → launch full Stage 1 annotation
+
+### Day 3+ (GPU-free, labeling runs) — Gold Set + Benchmarking
+16. Gold set sampling, human label infrastructure
+17. Benchmark runner + metrics
+18. Consensus + judge on full corpus
+19. Begin human labeling
+20. Prepare DAPT corpus
+
+### GPU Available — Training
+21. Python training scripts (model.py, train.py, losses.py)
+22. `program.md` for autoresearch
+23. DAPT (~2-3 days)
+24. Fine-tuning ablations via autoresearch
+25. Unsloth decoder experiment
+26. Final evaluation + error analysis
+
+---
+
+## Verification
+
+After implementation, verify end-to-end:
+1. `bun sec extract:segment --limit 10` produces valid Paragraph JSONL
+2. `bun sec label:annotate --model openai/gpt-oss-120b --limit 5` returns valid Annotations with cost tracking
+3. `bun sec label:consensus` correctly identifies agreement/disagreement
+4. `bun sec validate:schema --input data/annotations/stage1/gpt-oss-120b.jsonl --schema annotation` passes
+5. Python training script loads JSONL splits and begins training without errors
+6. `results/experiments.tsv` gets populated after one autoresearch iteration
diff --git a/python/.gitignore b/python/.gitignore
new file mode 100644
index 0000000..505a3b1
--- /dev/null
+++ b/python/.gitignore
@@ -0,0 +1,10 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# Virtual environments
+.venv
diff --git a/python/.python-version b/python/.python-version
new file mode 100644
index 0000000..24ee5b1
--- /dev/null
+++ b/python/.python-version
@@ -0,0 +1 @@
+3.13
diff --git a/python/main.py b/python/main.py
new file mode 100644
index 0000000..7162f9a
--- /dev/null
+++ b/python/main.py
@@ -0,0 +1,6 @@
+def main():
+ print("Hello from sec-cybert-train!")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/pyproject.toml b/python/pyproject.toml
new file mode 100644
index 0000000..c97cbd8
--- /dev/null
+++ b/python/pyproject.toml
@@ -0,0 +1,7 @@
+[project]
+name = "sec-cybert-train"
+version = "0.1.0"
+description = "Training pipeline for SEC-cyBERT: DAPT, encoder fine-tuning, and evaluation"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = []
diff --git a/ts/.gitignore b/ts/.gitignore
new file mode 100644
index 0000000..a14702c
--- /dev/null
+++ b/ts/.gitignore
@@ -0,0 +1,34 @@
+# dependencies (bun install)
+node_modules
+
+# output
+out
+dist
+*.tgz
+
+# code coverage
+coverage
+*.lcov
+
+# logs
+logs
+_.log
+report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json
+
+# dotenv environment variable files
+.env
+.env.development.local
+.env.test.local
+.env.production.local
+.env.local
+
+# caches
+.eslintcache
+.cache
+*.tsbuildinfo
+
+# IntelliJ based IDEs
+.idea
+
+# Finder (MacOS) folder config
+.DS_Store
diff --git a/ts/package.json b/ts/package.json
new file mode 100644
index 0000000..b25d4b1
--- /dev/null
+++ b/ts/package.json
@@ -0,0 +1,25 @@
+{
+ "name": "sec-cybert",
+ "module": "src/cli.ts",
+ "type": "module",
+ "private": true,
+ "devDependencies": {
+ "@types/bun": "latest",
+ "@types/uuid": "^11.0.0"
+ },
+ "scripts": {
+ "sec": "bun run src/cli.ts",
+ "typecheck": "bunx tsc --noEmit"
+ },
+ "peerDependencies": {
+ "typescript": "^5"
+ },
+ "dependencies": {
+ "@openrouter/ai-sdk-provider": "^2.3.3",
+ "ai": "^6.0.141",
+ "cheerio": "^1.2.0",
+ "p-limit": "^7.3.0",
+ "uuid": "^13.0.0",
+ "zod": "^4.3.6"
+ }
+}
diff --git a/ts/scripts/dispute-crosstab.ts b/ts/scripts/dispute-crosstab.ts
new file mode 100644
index 0000000..c1beb73
--- /dev/null
+++ b/ts/scripts/dispute-crosstab.ts
@@ -0,0 +1,501 @@
+/**
+ * Detailed cross-tabulations for disputed (non-unanimous) paragraphs.
+ *
+ * Usage: bun ts/scripts/dispute-crosstab.ts
+ */
+import { readJsonlRaw, readJsonl } from "../src/lib/jsonl.ts";
+import { Paragraph } from "../src/schemas/paragraph.ts";
+
+const ANN_PATH = new URL("../../data/annotations/stage1.jsonl", import.meta.url).pathname;
+const PARA_PATH = new URL("../../data/paragraphs/paragraphs-clean.jsonl", import.meta.url).pathname;
+
+interface Ann {
+ paragraphId: string;
+ label: {
+ content_category: string;
+ specificity_level: number;
+ category_confidence: string;
+ specificity_confidence: string;
+ reasoning: string;
+ };
+ provenance: {
+ modelId: string;
+ costUsd: number;
+ inputTokens: number;
+ outputTokens: number;
+ reasoningTokens: number;
+ latencyMs: number;
+ requestedAt: string;
+ };
+}
+
+// ── Helpers ────────────────────────────────────────────────────────────
+function pct(n: number, total: number): string {
+ if (total === 0) return "0.0%";
+ return `${((n / total) * 100).toFixed(1)}%`;
+}
+
+function median(arr: number[]): number {
+ if (arr.length === 0) return 0;
+ const sorted = [...arr].sort((a, b) => a - b);
+ const mid = Math.floor(sorted.length / 2);
+ return sorted.length % 2 ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2;
+}
+
+function percentile(arr: number[], p: number): number {
+ if (arr.length === 0) return 0;
+ const sorted = [...arr].sort((a, b) => a - b);
+ const idx = (p / 100) * (sorted.length - 1);
+ const lo = Math.floor(idx);
+ const hi = Math.ceil(idx);
+ return lo === hi ? sorted[lo] : sorted[lo] + (sorted[hi] - sorted[lo]) * (idx - lo);
+}
+
+function majority<T>(arr: T[]): T | null {
+ const freq = new Map<T, number>();
+ for (const v of arr) freq.set(v, (freq.get(v) ?? 0) + 1);
+ for (const [val, count] of freq) {
+ if (count >= 2) return val;
+ }
+ return null;
+}
+
+function sortedVals(arr: number[]): string {
+ return `[${[...arr].sort((a, b) => a - b).join(",")}]`;
+}
+
+function uniqueSorted(arr: string[]): string[] {
+ return [...new Set(arr)].sort();
+}
+
+// ── Main ──────────────────────────────────────────────────────────────
+async function main() {
+ console.log("Loading data...");
+ const [{ records: rawAnns, skipped: annSkipped }, { records: paragraphs, skipped: paraSkipped }] =
+ await Promise.all([
+ readJsonlRaw(ANN_PATH),
+ readJsonl(PARA_PATH, Paragraph),
+ ]);
+
+ const anns = rawAnns as Ann[];
+ console.log(` ${anns.length.toLocaleString()} annotations (${annSkipped} skipped)`);
+ console.log(` ${paragraphs.length.toLocaleString()} paragraphs (${paraSkipped} skipped)\n`);
+
+ // Index paragraphs by id
+ const paraById = new Map(paragraphs.map(p => [p.id, p]));
+
+ // Group annotations by paragraph
+ const byParagraph = new Map<string, Ann[]>();
+ for (const a of anns) {
+ let arr = byParagraph.get(a.paragraphId);
+ if (!arr) { arr = []; byParagraph.set(a.paragraphId, arr); }
+ arr.push(a);
+ }
+
+ // Classify each paragraph
+ interface ParaInfo {
+ pid: string;
+ cats: string[];
+ specs: number[];
+ catUnanimous: boolean;
+ specUnanimous: boolean;
+ majCat: string | null;
+ majSpec: number | null;
+ catDisputed: boolean;
+ specDisputed: boolean;
+ disputeType: "none" | "cat-only" | "spec-only" | "both";
+ wordCount: number;
+ }
+
+ const allParas: ParaInfo[] = [];
+ for (const [pid, panns] of byParagraph) {
+ if (panns.length !== 3) continue;
+ const cats = panns.map(a => a.label.content_category);
+ const specs = panns.map(a => a.label.specificity_level);
+ const catU = new Set(cats).size === 1;
+ const specU = new Set(specs).size === 1;
+ const majCat = majority(cats);
+ const majSpec = majority(specs);
+ const catDisputed = !catU;
+ const specDisputed = !specU;
+ let disputeType: ParaInfo["disputeType"] = "none";
+ if (catDisputed && !specDisputed) disputeType = "cat-only";
+ else if (!catDisputed && specDisputed) disputeType = "spec-only";
+ else if (catDisputed && specDisputed) disputeType = "both";
+
+ const para = paraById.get(pid);
+ allParas.push({
+ pid,
+ cats,
+ specs,
+ catUnanimous: catU,
+ specUnanimous: specU,
+ majCat,
+ majSpec,
+ catDisputed,
+ specDisputed,
+ disputeType,
+ wordCount: para?.wordCount ?? 0,
+ });
+ }
+
+ const disputed = allParas.filter(p => p.disputeType !== "none");
+ const catOnly = allParas.filter(p => p.disputeType === "cat-only");
+ const specOnly = allParas.filter(p => p.disputeType === "spec-only");
+ const bothDisputed = allParas.filter(p => p.disputeType === "both");
+
+ console.log("═══════════════════════════════════════════════════════════════════");
+ console.log(" DISPUTE CROSS-TABULATION ANALYSIS");
+ console.log("═══════════════════════════════════════════════════════════════════");
+ console.log(` Total paragraphs (3-annotator): ${allParas.length.toLocaleString()}`);
+ console.log(` Disputed (not both-unanimous): ${disputed.length.toLocaleString()} (${pct(disputed.length, allParas.length)})`);
+ console.log(` Cat-only: ${catOnly.length.toLocaleString()}`);
+ console.log(` Spec-only: ${specOnly.length.toLocaleString()}`);
+ console.log(` Both: ${bothDisputed.length.toLocaleString()}`);
+
+ // ════════════════════════════════════════════════════════════════════════
+ // 1. CATEGORY x SPECIFICITY CROSS-TAB FOR DISPUTED PARAGRAPHS
+ // ════════════════════════════════════════════════════════════════════════
+ console.log("\n\n══════════════════════════════════════════════════════════════");
+ console.log(" 1. CATEGORY x SPECIFICITY CROSS-TAB (disputed paragraphs)");
+ console.log("══════════════════════════════════════════════════════════════");
+ console.log(" Uses majority-vote labels for both axes.\n");
+
+ // Collect all categories from majority votes
+ const catCounts = new Map();
+ for (const p of disputed) {
+ if (p.majCat) catCounts.set(p.majCat, (catCounts.get(p.majCat) ?? 0) + 1);
+ }
+ const categories = [...catCounts.entries()].sort((a, b) => b[1] - a[1]).map(([c]) => c);
+ const specLevels = [1, 2, 3, 4];
+ const specLabels = ["GenBoiler", "SectorAdpt", "FirmSpec", "QuantVerif"];
+
+ // Build the cross-tab
+ const crossTab = new Map<string, number>();
+ let noMajCat = 0, noMajSpec = 0;
+ for (const p of disputed) {
+ if (!p.majCat || p.majSpec === null) {
+ if (!p.majCat) noMajCat++;
+ if (p.majSpec === null) noMajSpec++;
+ continue;
+ }
+ const key = `${p.majCat}|${p.majSpec}`;
+ crossTab.set(key, (crossTab.get(key) ?? 0) + 1);
+ }
+
+ // Print matrix
+ const colW = 12;
+ const catW = 28;
+ let header = "Category".padEnd(catW);
+ for (let i = 0; i < specLevels.length; i++) {
+ header += `${specLevels[i]}:${specLabels[i]}`.padStart(colW);
+ }
+ header += "Total".padStart(colW);
+ console.log(` ${header}`);
+ console.log(` ${"─".repeat(header.length)}`);
+
+ for (const cat of categories) {
+ let rowTotal = 0;
+ const cells: string[] = [];
+ for (const s of specLevels) {
+ const v = crossTab.get(`${cat}|${s}`) ?? 0;
+ rowTotal += v;
+ cells.push(v.toString());
+ }
+ let row = cat.padEnd(catW);
+ for (let i = 0; i < cells.length; i++) {
+ const v = parseInt(cells[i]);
+ const rowPct = rowTotal > 0 ? ((v / rowTotal) * 100).toFixed(0) : "0";
+ row += `${v} (${rowPct}%)`.padStart(colW);
+ }
+ row += `${rowTotal}`.padStart(colW);
+ console.log(` ${row}`);
+ }
+ console.log(`\n (${noMajCat} paragraphs had no majority category, ${noMajSpec} had no majority specificity)`);
+
+ // ════════════════════════════════════════════════════════════════════════
+ // 2. DISPUTE TYPE BY CATEGORY
+ // ════════════════════════════════════════════════════════════════════════
+ console.log("\n\n══════════════════════════════════════════════════════════════");
+ console.log(" 2. DISPUTE TYPE BY MAJORITY CATEGORY");
+ console.log("══════════════════════════════════════════════════════════════");
+ console.log(" For each majority category, % of disputes that are cat-only, spec-only, or both.\n");
+
+ // Group disputed paragraphs by majority category
+ const disputeByCat = new Map();
+ for (const p of disputed) {
+ const cat = p.majCat ?? "[no majority]";
+ if (!disputeByCat.has(cat)) disputeByCat.set(cat, { catOnly: 0, specOnly: 0, both: 0 });
+ const entry = disputeByCat.get(cat)!;
+ if (p.disputeType === "cat-only") entry.catOnly++;
+ else if (p.disputeType === "spec-only") entry.specOnly++;
+ else if (p.disputeType === "both") entry.both++;
+ }
+
+ const dHeader = "Category".padEnd(catW) + "n".padStart(8) + "Cat-only".padStart(12) + "Spec-only".padStart(12) + "Both".padStart(12);
+ console.log(` ${dHeader}`);
+ console.log(` ${"─".repeat(dHeader.length)}`);
+
+ const sortedDispCats = [...disputeByCat.entries()].sort((a, b) => {
+ const totalA = a[1].catOnly + a[1].specOnly + a[1].both;
+ const totalB = b[1].catOnly + b[1].specOnly + b[1].both;
+ return totalB - totalA;
+ });
+ for (const [cat, d] of sortedDispCats) {
+ const total = d.catOnly + d.specOnly + d.both;
+ const row = cat.padEnd(catW) +
+ total.toString().padStart(8) +
+ `${d.catOnly} (${pct(d.catOnly, total)})`.padStart(12) +
+ `${d.specOnly} (${pct(d.specOnly, total)})`.padStart(12) +
+ `${d.both} (${pct(d.both, total)})`.padStart(12);
+ console.log(` ${row}`);
+ }
+
+ // ════════════════════════════════════════════════════════════════════════
+ // 3. SPEC BOUNDARY DISPUTES BY CATEGORY
+ // ════════════════════════════════════════════════════════════════════════
+ console.log("\n\n══════════════════════════════════════════════════════════════");
+ console.log(" 3. SPECIFICITY BOUNDARY DISPUTES BY CATEGORY");
+ console.log("══════════════════════════════════════════════════════════════");
+ console.log(" For spec-disputed paragraphs, the spec vote pattern by majority category.\n");
+
+ // Group by majority category, then count spec patterns
+ const specPatternByCat = new Map<string, Map<string, number>>();
+ const specDisputed = allParas.filter(p => p.specDisputed);
+ for (const p of specDisputed) {
+ const cat = p.majCat ?? "[no majority]";
+ if (!specPatternByCat.has(cat)) specPatternByCat.set(cat, new Map());
+ const patternMap = specPatternByCat.get(cat)!;
+ // Show the unique values sorted as the boundary pattern
+ const uniqSorted = [...new Set(p.specs)].sort((a, b) => a - b);
+ const pattern = `[${uniqSorted.join(",")}]`;
+ patternMap.set(pattern, (patternMap.get(pattern) ?? 0) + 1);
+ }
+
+ // Collect all patterns
+ const allPatterns = new Set<string>();
+ for (const pm of specPatternByCat.values()) {
+ for (const pat of pm.keys()) allPatterns.add(pat);
+ }
+ const sortedPatterns = [...allPatterns].sort();
+
+ // Print header
+ const patW = 10;
+ let pHeader = "Category".padEnd(catW) + "n".padStart(6);
+ for (const pat of sortedPatterns) {
+ pHeader += pat.padStart(patW);
+ }
+ console.log(` ${pHeader}`);
+ console.log(` ${"─".repeat(pHeader.length)}`);
+
+ const specPatCats = [...specPatternByCat.entries()].sort((a, b) => {
+ let totalA = 0, totalB = 0;
+ for (const v of a[1].values()) totalA += v;
+ for (const v of b[1].values()) totalB += v;
+ return totalB - totalA;
+ });
+ for (const [cat, pm] of specPatCats) {
+ let total = 0;
+ for (const v of pm.values()) total += v;
+ let row = cat.padEnd(catW) + total.toString().padStart(6);
+ for (const pat of sortedPatterns) {
+ const v = pm.get(pat) ?? 0;
+ if (v === 0) {
+ row += "-".padStart(patW);
+ } else {
+ row += `${v}`.padStart(patW);
+ }
+ }
+ console.log(` ${row}`);
+ }
+
+ // Also show with percentages within each category
+ console.log("\n (Row percentages:)");
+ let pHeader2 = "Category".padEnd(catW) + "n".padStart(6);
+ for (const pat of sortedPatterns) {
+ pHeader2 += pat.padStart(patW);
+ }
+ console.log(` ${pHeader2}`);
+ console.log(` ${"─".repeat(pHeader2.length)}`);
+ for (const [cat, pm] of specPatCats) {
+ let total = 0;
+ for (const v of pm.values()) total += v;
+ let row = cat.padEnd(catW) + total.toString().padStart(6);
+ for (const pat of sortedPatterns) {
+ const v = pm.get(pat) ?? 0;
+ if (v === 0) {
+ row += "-".padStart(patW);
+ } else {
+ row += `${((v / total) * 100).toFixed(0)}%`.padStart(patW);
+ }
+ }
+ console.log(` ${row}`);
+ }
+
+ // ════════════════════════════════════════════════════════════════════════
+ // 4. WORD COUNT DISTRIBUTION BY DISPUTE TYPE
+ // ════════════════════════════════════════════════════════════════════════
+ console.log("\n\n══════════════════════════════════════════════════════════════");
+ console.log(" 4. WORD COUNT DISTRIBUTION BY DISPUTE TYPE");
+ console.log("══════════════════════════════════════════════════════════════\n");
+
+ const groups: { label: string; paras: ParaInfo[] }[] = [
+ { label: "Unanimous (no dispute)", paras: allParas.filter(p => p.disputeType === "none") },
+ { label: "Cat-only dispute", paras: catOnly },
+ { label: "Spec-only dispute", paras: specOnly },
+ { label: "Both disputed", paras: bothDisputed },
+ ];
+
+ const wcHeader = "Dispute Type".padEnd(28) + "n".padStart(8) + "Median".padStart(10) + "P90".padStart(10) + "P10".padStart(10) + "Mean".padStart(10);
+ console.log(` ${wcHeader}`);
+ console.log(` ${"─".repeat(wcHeader.length)}`);
+
+ for (const g of groups) {
+ const wcs = g.paras.map(p => p.wordCount).filter(w => w > 0);
+ if (wcs.length === 0) continue;
+ const row = g.label.padEnd(28) +
+ wcs.length.toString().padStart(8) +
+ median(wcs).toFixed(0).padStart(10) +
+ percentile(wcs, 90).toFixed(0).padStart(10) +
+ percentile(wcs, 10).toFixed(0).padStart(10) +
+ (wcs.reduce((a, b) => a + b, 0) / wcs.length).toFixed(0).padStart(10);
+ console.log(` ${row}`);
+ }
+
+ // ════════════════════════════════════════════════════════════════════════
+ // 5. UNRESOLVED PARAGRAPH ANALYSIS (3-WAY SPLITS)
+ // ════════════════════════════════════════════════════════════════════════
+ console.log("\n\n══════════════════════════════════════════════════════════════");
+ console.log(" 5. UNRESOLVED PARAGRAPH ANALYSIS (3-way category splits)");
+ console.log("══════════════════════════════════════════════════════════════\n");
+
+ const unresolved = allParas.filter(p => p.majCat === null);
+ console.log(` Total unresolved paragraphs: ${unresolved.length.toLocaleString()}`);
+
+ // Category representations in unresolved
+ const unresolvedCatFreq = new Map();
+ for (const p of unresolved) {
+ for (const c of p.cats) {
+ unresolvedCatFreq.set(c, (unresolvedCatFreq.get(c) ?? 0) + 1);
+ }
+ }
+ console.log("\n Categories appearing in unresolved paragraphs (annotation count):");
+ const sortedUnresCats = [...unresolvedCatFreq.entries()].sort((a, b) => b[1] - a[1]);
+ for (const [cat, count] of sortedUnresCats) {
+ console.log(` ${count.toString().padStart(6)} ${cat}`);
+ }
+
+ // Specificity in unresolved
+ const unresolvedSpecFreq = new Map();
+ for (const p of unresolved) {
+ for (const s of p.specs) {
+ unresolvedSpecFreq.set(s, (unresolvedSpecFreq.get(s) ?? 0) + 1);
+ }
+ }
+ console.log("\n Specificity levels in unresolved paragraphs (annotation count):");
+ for (let s = 1; s <= 4; s++) {
+ const count = unresolvedSpecFreq.get(s) ?? 0;
+ console.log(` ${count.toString().padStart(6)} ${s}`);
+ }
+
+ // Most common 3-way category splits
+ const threewayPatterns = new Map();
+ for (const p of unresolved) {
+ const sorted = [...p.cats].sort();
+ const key = sorted.join(" / ");
+ threewayPatterns.set(key, (threewayPatterns.get(key) ?? 0) + 1);
+ }
+
+ console.log("\n Most common 3-way category splits:");
+ const sortedThreeWay = [...threewayPatterns.entries()].sort((a, b) => b[1] - a[1]);
+ for (const [pattern, count] of sortedThreeWay.slice(0, 20)) {
+ console.log(` ${count.toString().padStart(6)} ${pattern}`);
+ }
+ if (sortedThreeWay.length > 20) {
+ console.log(` ... and ${sortedThreeWay.length - 20} more patterns`);
+ }
+
+ // Specificity agreement among unresolved
+ const unresolvedSpecUnanimous = unresolved.filter(p => p.specUnanimous).length;
+ const unresolvedSpecMaj = unresolved.filter(p => p.majSpec !== null).length;
+ console.log(`\n Specificity agreement among unresolved:`);
+ console.log(` Spec unanimous: ${unresolvedSpecUnanimous} (${pct(unresolvedSpecUnanimous, unresolved.length)})`);
+ console.log(` Spec majority: ${unresolvedSpecMaj} (${pct(unresolvedSpecMaj, unresolved.length)})`);
+ console.log(` Spec 3-way: ${unresolved.length - unresolvedSpecMaj} (${pct(unresolved.length - unresolvedSpecMaj, unresolved.length)})`);
+
+ // ════════════════════════════════════════════════════════════════════════
+ // 6. "BOTH" DISPUTES — COMBINED PATTERNS
+ // ════════════════════════════════════════════════════════════════════════
+ console.log("\n\n══════════════════════════════════════════════════════════════");
+ console.log(" 6. 'BOTH' DISPUTES — COMBINED CATEGORY + SPECIFICITY PATTERNS");
+ console.log("══════════════════════════════════════════════════════════════\n");
+
+ console.log(` Total paragraphs with both cat AND spec disputed: ${bothDisputed.length.toLocaleString()}\n`);
+
+ // For each, compute the category dispute pair + spec boundary
+ const combinedPatterns = new Map();
+ for (const p of bothDisputed) {
+ // Category dispute description
+ const catUniq = uniqueSorted(p.cats);
+ let catPart: string;
+ if (catUniq.length === 2) {
+ // 2-1 split: show as "A<->B"
+ catPart = `${catUniq[0]}↔${catUniq[1]}`;
+ } else {
+ // 3-way
+ catPart = catUniq.join("/");
+ }
+
+ // Spec dispute description
+ const specUniq = [...new Set(p.specs)].sort((a, b) => a - b);
+ const specPart = `[${specUniq.join(",")}]`;
+
+ const combined = `${catPart} + ${specPart}`;
+ combinedPatterns.set(combined, (combinedPatterns.get(combined) ?? 0) + 1);
+ }
+
+ const sortedCombined = [...combinedPatterns.entries()].sort((a, b) => b[1] - a[1]);
+ console.log(" Top 30 combined dispute patterns:");
+ for (const [pattern, count] of sortedCombined.slice(0, 30)) {
+ console.log(` ${count.toString().padStart(6)} ${pattern}`);
+ }
+ if (sortedCombined.length > 30) {
+ console.log(`\n ... and ${sortedCombined.length - 30} more patterns (${sortedCombined.slice(30).reduce((a, b) => a + b[1], 0)} paragraphs)`);
+ }
+
+ // Also summarize by just the category pair (aggregating across spec patterns)
+ console.log("\n Category dispute pairs (aggregated across spec patterns):");
+ const catPairAgg = new Map();
+ for (const p of bothDisputed) {
+ const catUniq = uniqueSorted(p.cats);
+ let catPart: string;
+ if (catUniq.length === 2) {
+ catPart = `${catUniq[0]}↔${catUniq[1]}`;
+ } else {
+ catPart = catUniq.join("/");
+ }
+ catPairAgg.set(catPart, (catPairAgg.get(catPart) ?? 0) + 1);
+ }
+ const sortedCatPairs = [...catPairAgg.entries()].sort((a, b) => b[1] - a[1]);
+ for (const [pair, count] of sortedCatPairs.slice(0, 20)) {
+ console.log(` ${count.toString().padStart(6)} ${pair}`);
+ }
+
+ console.log("\n Spec boundary patterns within 'both' disputes:");
+ const specPatAgg = new Map();
+ for (const p of bothDisputed) {
+ const specUniq = [...new Set(p.specs)].sort((a, b) => a - b);
+ const specPart = `[${specUniq.join(",")}]`;
+ specPatAgg.set(specPart, (specPatAgg.get(specPart) ?? 0) + 1);
+ }
+ const sortedSpecPats = [...specPatAgg.entries()].sort((a, b) => b[1] - a[1]);
+ for (const [pat, count] of sortedSpecPats) {
+ console.log(` ${count.toString().padStart(6)} ${pat} (${pct(count, bothDisputed.length)})`);
+ }
+
+ console.log("\n═══════════════════════════════════════════════════════════════════");
+ console.log(" ANALYSIS COMPLETE");
+ console.log("═══════════════════════════════════════════════════════════════════");
+}
+
// Entry point: surface any unhandled failure and exit non-zero.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});
diff --git a/ts/scripts/judge-bench.ts b/ts/scripts/judge-bench.ts
new file mode 100644
index 0000000..c815d47
--- /dev/null
+++ b/ts/scripts/judge-bench.ts
@@ -0,0 +1,455 @@
+/**
+ * Benchmark Stage 2 judge candidates on disagreement paragraphs.
+ * Runs each model as a judge and compares against Stage 1 majority vote.
+ *
+ * Usage: bun ts/scripts/judge-bench.ts <model-id> [--n 50] [--concurrency 10]
+ */
+import { generateText, tool, Output } from "ai";
+import { openrouter, providerOf } from "../src/lib/openrouter.ts";
+import { readJsonl, readJsonlRaw, appendJsonl } from "../src/lib/jsonl.ts";
+import { Paragraph } from "../src/schemas/paragraph.ts";
+import { LabelOutputRaw, toLabelOutput } from "../src/schemas/label.ts";
+import { SYSTEM_PROMPT, buildJudgePrompt, PROMPT_VERSION } from "../src/label/prompts.ts";
+import { withRetry } from "../src/lib/retry.ts";
+import { v4 as uuidv4 } from "uuid";
+import { existsSync } from "node:fs";
+import { mkdir } from "node:fs/promises";
+import pLimit from "p-limit";
+
+const args = process.argv.slice(2);
+const MODEL = args.find(a => !a.startsWith("--"))!;
+if (!MODEL) { console.error("Usage: bun ts/scripts/judge-bench.ts "); process.exit(1); }
+
+function flag(name: string): string | undefined {
+ const idx = args.indexOf(`--${name}`);
+ return idx === -1 ? undefined : args[idx + 1];
+}
+const N = parseInt(flag("n") ?? "50", 10);
+const CONCURRENCY = parseInt(flag("concurrency") ?? "10", 10);
+const MODE = (flag("mode") ?? "structured") as "structured" | "tool";
+const shortName = MODEL.split("/").pop()!;
+const slug = MODEL.replace("/", "_");
+
+const STAGE1_PATH = new URL("../../data/annotations/stage1.jsonl", import.meta.url).pathname;
+const PARAGRAPHS_PATH = new URL("../../data/paragraphs/training.jsonl", import.meta.url).pathname;
+const BENCH_DIR = new URL("../../data/bench/judges", import.meta.url).pathname;
+const SAMPLE_PATH = `${BENCH_DIR}/judge-sample.jsonl`;
+const OUTPUT_PATH = `${BENCH_DIR}/${slug}.jsonl`;
+
+if (!existsSync(BENCH_DIR)) await mkdir(BENCH_DIR, { recursive: true });
+
/** Shape of one Stage 1 annotation record as read from stage1.jsonl. */
interface S1Ann {
  paragraphId: string;
  // The annotating model's label; `reasoning` is forwarded verbatim to the judge prompt.
  label: { content_category: string; specificity_level: number; reasoning: string };
  // Which Stage 1 model produced this annotation (OpenRouter model id).
  provenance: { modelId: string };
}
+
+function pct(n: number, total: number): string {
+ return `${((n / total) * 100).toFixed(1)}%`;
+}
+
/**
 * Run one judge-model benchmark end to end:
 *  1. Load Stage 1 annotations and find paragraphs where the 3 annotators disagree.
 *  2. Load (or create, seeded) a stable sample of those paragraphs shared by all judges.
 *  3. Call the judge model on each sampled paragraph (resumable; results appended to JSONL).
 *  4. Analyze: cost/latency, agreement vs gold labels (if present), vs Stage 1 majority,
 *     per-model agreement, majority/outlier siding, confidence distribution.
 *  5. Write a machine-readable report JSON next to the per-call output.
 */
async function main() {
  // ── Load Stage 1 annotations ────────────────────────────────────────
  console.error(`[${shortName}] Loading Stage 1 data...`);
  const { records: allAnns } = await readJsonlRaw(STAGE1_PATH);
  // paragraphId -> all Stage 1 annotations for that paragraph.
  const s1ByParagraph = new Map();
  for (const raw of allAnns) {
    const a = raw as S1Ann;
    let arr = s1ByParagraph.get(a.paragraphId);
    if (!arr) { arr = []; s1ByParagraph.set(a.paragraphId, arr); }
    arr.push(a);
  }

  // ── Find disagreement paragraphs ────────────────────────────────────
  // Only paragraphs with exactly 3 annotations are eligible; a dispute is any
  // difference in category OR specificity among the three.
  const disagreementIds: string[] = [];
  for (const [pid, anns] of s1ByParagraph) {
    if (anns.length !== 3) continue;
    const cats = new Set(anns.map(a => a.label.content_category));
    const specs = new Set(anns.map(a => a.label.specificity_level));
    if (cats.size > 1 || specs.size > 1) {
      disagreementIds.push(pid);
    }
  }
  console.error(`[${shortName}] ${disagreementIds.length.toLocaleString()} disagreement paragraphs total`);

  // ── Load or create stable sample ────────────────────────────────────
  // The sample file is created once and reused by every judge model so that
  // all judges are benchmarked on exactly the same paragraphs.
  let sampleIds: string[];
  if (existsSync(SAMPLE_PATH)) {
    const { records } = await readJsonlRaw(SAMPLE_PATH);
    sampleIds = (records as { id: string }[]).map(r => r.id);
    console.error(`[${shortName}] Using existing sample of ${sampleIds.length} paragraphs`);
  } else {
    // Seeded shuffle for reproducibility
    // (linear congruential generator with fixed seed, then Fisher–Yates).
    const seed = 42;
    let rng = seed;
    const nextRng = () => { rng = (rng * 1664525 + 1013904223) & 0x7fffffff; return rng / 0x7fffffff; };
    const shuffled = [...disagreementIds];
    for (let i = shuffled.length - 1; i > 0; i--) {
      const j = Math.floor(nextRng() * (i + 1));
      [shuffled[i], shuffled[j]] = [shuffled[j]!, shuffled[i]!];
    }
    sampleIds = shuffled.slice(0, N);
    // Save stable sample
    for (const id of sampleIds) {
      await appendJsonl(SAMPLE_PATH, { id });
    }
    console.error(`[${shortName}] Created new sample of ${sampleIds.length} paragraphs`);
  }

  // ── Load paragraph texts ────────────────────────────────────────────
  console.error(`[${shortName}] Loading paragraph texts...`);
  const { records: allParagraphs } = await readJsonl(PARAGRAPHS_PATH, Paragraph);
  const paragraphMap = new Map(allParagraphs.map(p => [p.id, p]));

  // ── Resume support ──────────────────────────────────────────────────
  // Paragraph ids already present in the output file are skipped, so the
  // script can be re-run after a crash without duplicating calls.
  const doneKeys = new Set();
  if (existsSync(OUTPUT_PATH)) {
    const { records: existing } = await readJsonlRaw(OUTPUT_PATH);
    for (const r of existing) {
      const a = r as { paragraphId?: string };
      if (a.paragraphId) doneKeys.add(a.paragraphId);
    }
    if (doneKeys.size > 0) console.error(`[${shortName}] Resuming: ${doneKeys.size} already done`);
  }

  const remaining = sampleIds.filter(id => !doneKeys.has(id));
  if (remaining.length === 0) {
    console.error(`[${shortName}] All done, skipping to analysis`);
  } else {
    console.error(`[${shortName}] Running ${remaining.length} judge calls (concurrency=${CONCURRENCY})...\n`);

    const runId = uuidv4();
    const limit = pLimit(CONCURRENCY);
    let completed = 0, failed = 0, totalCost = 0;
    const startTime = Date.now();

    // One task per remaining paragraph, throttled by p-limit. Failures are
    // counted (and the first 3 logged) but do not abort the run.
    const tasks = remaining.map(pid => limit(async () => {
      const paragraph = paragraphMap.get(pid);
      if (!paragraph) { failed++; return; }

      const priorAnns = s1ByParagraph.get(pid)!;
      // The judge sees the three Stage 1 labels plus their reasoning.
      const priorForJudge = priorAnns.map(a => ({
        content_category: a.label.content_category,
        specificity_level: a.label.specificity_level,
        reasoning: a.label.reasoning,
      }));

      const requestedAt = new Date().toISOString();
      const start = Date.now();

      try {
        const providerOpts = {
          openrouter: {
            reasoning: { effort: "medium" as const },
            usage: { include: true },
            provider: { require_parameters: true },
          },
        };

        let rawOutput: LabelOutputRaw;
        let responseId: string;
        let usage: { inputTokens?: number; outputTokens?: number; outputTokenDetails?: { reasoningTokens?: number }; raw?: { cost?: number } };

        // Two elicitation modes: forced tool call vs. structured output.
        if (MODE === "tool") {
          const r = await withRetry(
            () => generateText({
              model: openrouter(MODEL),
              system: SYSTEM_PROMPT,
              prompt: buildJudgePrompt(paragraph, priorForJudge),
              temperature: 0,
              tools: {
                submit_label: tool({
                  description: "Submit your final label for this paragraph",
                  inputSchema: LabelOutputRaw,
                }),
              },
              toolChoice: "required",
              providerOptions: providerOpts,
              abortSignal: AbortSignal.timeout(240_000),
            }),
            { label: `${shortName}:${pid.slice(0, 8)}` },
          );
          const tc = r.toolCalls[0];
          if (!tc) throw new Error(`No tool call from ${shortName} for ${pid}`);
          rawOutput = tc.input as LabelOutputRaw;
          responseId = r.response?.id ?? "unknown";
          usage = r.usage as typeof usage;
        } else {
          const r = await withRetry(
            () => generateText({
              model: openrouter(MODEL),
              output: Output.object({ schema: LabelOutputRaw }),
              system: SYSTEM_PROMPT,
              prompt: buildJudgePrompt(paragraph, priorForJudge),
              temperature: 0,
              providerOptions: providerOpts,
              abortSignal: AbortSignal.timeout(240_000),
            }),
            { label: `${shortName}:${pid.slice(0, 8)}` },
          );
          if (!r.output) throw new Error(`No output from ${shortName} for ${pid}`);
          rawOutput = r.output;
          responseId = r.response?.id ?? "unknown";
          usage = r.usage as typeof usage;
        }

        const latencyMs = Date.now() - start;
        const label = toLabelOutput(rawOutput);
        // OpenRouter reports cost in the raw usage payload when usage.include is set.
        const costUsd = usage.raw?.cost ?? 0;

        const annotation = {
          paragraphId: pid,
          label,
          provenance: {
            modelId: MODEL,
            provider: providerOf(MODEL),
            generationId: responseId,
            stage: "stage2-judge" as const,
            runId,
            promptVersion: PROMPT_VERSION,
            inputTokens: usage.inputTokens ?? 0,
            outputTokens: usage.outputTokens ?? 0,
            reasoningTokens: usage.outputTokenDetails?.reasoningTokens ?? 0,
            costUsd,
            latencyMs,
            requestedAt,
          },
        };

        await appendJsonl(OUTPUT_PATH, annotation);
        totalCost += costUsd;
        completed++;

        if (completed % 10 === 0) {
          process.stderr.write(`\r[${shortName}] ${completed}/${remaining.length} ($${totalCost.toFixed(4)}) `);
        }
      } catch (err) {
        failed++;
        const msg = err instanceof Error ? err.message : String(err);
        if (failed <= 3) console.error(`\n[${shortName}] ✖ ${pid.slice(0, 8)}: ${msg.slice(0, 200)}`);
      }
    }));

    await Promise.all(tasks);
    const elapsed = ((Date.now() - startTime) / 1000).toFixed(0);
    console.error(`\n[${shortName}] Done: ${completed} ok, ${failed} failed, $${totalCost.toFixed(4)}, ${elapsed}s`);
  }

  // ── Analysis ────────────────────────────────────────────────────────
  // Re-read the output file so the analysis covers prior runs too (resume).
  const { records: judgeRaw } = await readJsonlRaw(OUTPUT_PATH);
  const judgeResults = new Map();
  for (const r of judgeRaw) {
    const a = r as { paragraphId: string; label: { content_category: string; specificity_level: number; category_confidence: string; specificity_confidence: string }; provenance: { costUsd: number; outputTokens: number; reasoningTokens: number; latencyMs: number } };
    judgeResults.set(a.paragraphId, { ...a.label, ...a.provenance });
  }

  const n = judgeResults.size;
  // NOTE(review): the averages below divide by n; if every call failed (n=0)
  // this section prints NaN — consider bailing out early in that case.
  let totalCost = 0, totalOutput = 0, totalReasoning = 0, totalLatency = 0;
  for (const v of judgeResults.values()) {
    totalCost += v.costUsd;
    totalOutput += v.outputTokens;
    totalReasoning += v.reasoningTokens;
    totalLatency += v.latencyMs;
  }

  console.log(`\n═══ ${shortName} as Judge (n=${n}) ═══`);
  console.log(` Cost: $${totalCost.toFixed(4)} total, $${(totalCost / n).toFixed(5)}/call`);
  console.log(` Latency: ${(totalLatency / n).toFixed(0)}ms avg`);
  console.log(` Output: ${(totalOutput / n).toFixed(0)} tokens avg, ${(totalReasoning / n).toFixed(0)} reasoning avg`);
  console.log(` Est. full Stage 2 cost (14,623 calls): $${(totalCost / n * 14623).toFixed(0)}`);

  // ── Load gold labels ───────────────────────────────────────────────
  // Optional hand-labeled answers; when absent, all gold comparisons are skipped.
  const GOLD_PATH = `${BENCH_DIR}/gold-final.json`;
  let goldLabels: Record = {};
  if (existsSync(GOLD_PATH)) {
    goldLabels = JSON.parse(await Bun.file(GOLD_PATH).text());
    console.log(`\n Gold labels loaded: ${Object.keys(goldLabels).length} paragraphs`);
  } else {
    console.log(`\n ⚠ No gold labels found at ${GOLD_PATH} — skipping gold comparison`);
  }

  // ── Compare vs gold labels ─────────────────────────────────────────
  const hasGold = Object.keys(goldLabels).length > 0;
  let goldCatMatch = 0, goldSpecMatch = 0, goldBothMatch = 0, goldTotal = 0;
  let majGoldCatMatch = 0, majGoldSpecMatch = 0, majGoldBothMatch = 0, majGoldTotal = 0;

  // Confidence breakdown vs gold accuracy
  const confBuckets = { high: { correct: 0, total: 0 }, medium: { correct: 0, total: 0 }, low: { correct: 0, total: 0 } };

  // Per-category accuracy vs gold
  const catAccuracy = new Map();

  // Confusion matrix for category errors
  const catConfusions: { gold: string; judge: string }[] = [];

  if (hasGold) {
    for (const [pid, judgeLabel] of judgeResults) {
      const gold = goldLabels[pid];
      if (!gold) continue;
      goldTotal++;

      const catOk = judgeLabel.content_category === gold.cat;
      const specOk = judgeLabel.specificity_level === gold.spec;
      if (catOk) goldCatMatch++;
      if (specOk) goldSpecMatch++;
      if (catOk && specOk) goldBothMatch++;

      // Track confidence vs accuracy (use lower of the two confidences)
      const worstConf = judgeLabel.category_confidence === "low" || judgeLabel.specificity_confidence === "low"
        ? "low"
        : judgeLabel.category_confidence === "medium" || judgeLabel.specificity_confidence === "medium"
          ? "medium"
          : "high";
      confBuckets[worstConf].total++;
      if (catOk && specOk) confBuckets[worstConf].correct++;

      // Per-category
      if (!catAccuracy.has(gold.cat)) catAccuracy.set(gold.cat, { correct: 0, total: 0 });
      const ca = catAccuracy.get(gold.cat)!;
      ca.total++;
      if (catOk) ca.correct++;

      // Confusion matrix entries for errors
      if (!catOk) catConfusions.push({ gold: gold.cat, judge: judgeLabel.content_category });

      // Majority vote vs gold
      // (2-of-3 vote per dimension; undefined when all three annotators differ).
      const s1anns = s1ByParagraph.get(pid)!;
      const cats = s1anns.map(a => a.label.content_category);
      const catFreq = new Map();
      for (const c of cats) catFreq.set(c, (catFreq.get(c) ?? 0) + 1);
      const majCat = [...catFreq.entries()].find(([, v]) => v >= 2)?.[0];

      const specs = s1anns.map(a => a.label.specificity_level);
      const specFreq = new Map();
      for (const s of specs) specFreq.set(s, (specFreq.get(s) ?? 0) + 1);
      const majSpec = [...specFreq.entries()].find(([, v]) => v >= 2)?.[0];

      majGoldTotal++;
      if (majCat === gold.cat) majGoldCatMatch++;
      if (majSpec === gold.spec) majGoldSpecMatch++;
      if (majCat === gold.cat && majSpec === gold.spec) majGoldBothMatch++;
    }

    console.log(`\n ── vs GOLD LABELS (n=${goldTotal}) ──`);
    console.log(` Judge: cat ${pct(goldCatMatch, goldTotal)}, spec ${pct(goldSpecMatch, goldTotal)}, both ${pct(goldBothMatch, goldTotal)}`);
    console.log(` Majority: cat ${pct(majGoldCatMatch, majGoldTotal)}, spec ${pct(majGoldSpecMatch, majGoldTotal)}, both ${pct(majGoldBothMatch, majGoldTotal)}`);
    console.log(` Delta: cat +${((goldCatMatch - majGoldCatMatch) / goldTotal * 100).toFixed(1)}pp, spec +${((goldSpecMatch - majGoldSpecMatch) / goldTotal * 100).toFixed(1)}pp, both +${((goldBothMatch - majGoldBothMatch) / goldTotal * 100).toFixed(1)}pp`);

    // Confidence calibration
    console.log(`\n ── CONFIDENCE CALIBRATION ──`);
    for (const [level, bucket] of Object.entries(confBuckets)) {
      if (bucket.total > 0) {
        console.log(` ${level.padEnd(8)} ${pct(bucket.correct, bucket.total).padStart(6)} both-correct (n=${bucket.total})`);
      }
    }

    // Per-category accuracy
    console.log(`\n ── PER-CATEGORY ACCURACY (vs gold) ──`);
    for (const [cat, acc] of [...catAccuracy.entries()].sort((a, b) => b[1].total - a[1].total)) {
      console.log(` ${cat.padEnd(30)} ${pct(acc.correct, acc.total).padStart(6)} (${acc.correct}/${acc.total})`);
    }

    // Category confusions
    if (catConfusions.length > 0) {
      console.log(`\n ── CATEGORY ERRORS (${catConfusions.length} total) ──`);
      const confusionCounts = new Map();
      for (const { gold, judge } of catConfusions) {
        const key = `${gold} → ${judge}`;
        confusionCounts.set(key, (confusionCounts.get(key) ?? 0) + 1);
      }
      for (const [pair, count] of [...confusionCounts.entries()].sort(([, a], [, b]) => b - a)) {
        console.log(` ${pair}: ${count}`);
      }
    }
  }

  // ── Compare judge vs Stage 1 majority vote ─────────────────────────
  let agreeMajCat = 0, agreeMajSpec = 0, agreeMajBoth = 0;
  // Per Stage 1 model: how often the judge's label matches that model's label.
  const modelAgreement = new Map();

  for (const [pid, judgeLabel] of judgeResults) {
    const s1anns = s1ByParagraph.get(pid)!;

    const cats = s1anns.map(a => a.label.content_category);
    const catFreq = new Map();
    for (const c of cats) catFreq.set(c, (catFreq.get(c) ?? 0) + 1);
    const majCat = [...catFreq.entries()].find(([, v]) => v >= 2)?.[0];

    const specs = s1anns.map(a => a.label.specificity_level);
    const specFreq = new Map();
    for (const s of specs) specFreq.set(s, (specFreq.get(s) ?? 0) + 1);
    const majSpec = [...specFreq.entries()].find(([, v]) => v >= 2)?.[0];

    if (majCat && judgeLabel.content_category === majCat) agreeMajCat++;
    if (majSpec !== undefined && judgeLabel.specificity_level === majSpec) agreeMajSpec++;
    if (majCat && judgeLabel.content_category === majCat && majSpec !== undefined && judgeLabel.specificity_level === majSpec) agreeMajBoth++;

    for (const s1 of s1anns) {
      const m = s1.provenance.modelId.split("/").pop()!;
      if (!modelAgreement.has(m)) modelAgreement.set(m, { cat: 0, spec: 0, total: 0 });
      const ma = modelAgreement.get(m)!;
      ma.total++;
      if (s1.label.content_category === judgeLabel.content_category) ma.cat++;
      if (s1.label.specificity_level === judgeLabel.specificity_level) ma.spec++;
    }
  }

  console.log(`\n ── vs Stage 1 Majority ──`);
  console.log(` cat ${pct(agreeMajCat, n)}, spec ${pct(agreeMajSpec, n)}, both ${pct(agreeMajBoth, n)}`);
  console.log(`\n vs Individual Stage 1 models:`);
  for (const [m, a] of [...modelAgreement.entries()].sort()) {
    console.log(` × ${m.padEnd(30)} cat ${pct(a.cat, a.total).padStart(6)} spec ${pct(a.spec, a.total).padStart(6)}`);
  }

  // How often does judge side with outlier vs majority?
  let sidesMajority = 0, sidesOutlier = 0, sidesNeither = 0;
  for (const [pid, judgeLabel] of judgeResults) {
    const s1anns = s1ByParagraph.get(pid)!;
    const cats = s1anns.map(a => a.label.content_category);
    const catFreq = new Map();
    for (const c of cats) catFreq.set(c, (catFreq.get(c) ?? 0) + 1);
    const majCat = [...catFreq.entries()].find(([, v]) => v >= 2)?.[0];

    // 3-way splits have no majority to side with; counted as "neither".
    if (!majCat) { sidesNeither++; continue; }
    const outlierCats = cats.filter(c => c !== majCat);
    if (judgeLabel.content_category === majCat) sidesMajority++;
    else if (outlierCats.includes(judgeLabel.content_category)) sidesOutlier++;
    else sidesNeither++;
  }
  console.log(`\n Judge category decision pattern:`);
  console.log(` Sides with majority: ${sidesMajority} (${pct(sidesMajority, n)})`);
  console.log(` Sides with outlier: ${sidesOutlier} (${pct(sidesOutlier, n)})`);
  console.log(` Neither (own pick): ${sidesNeither} (${pct(sidesNeither, n)})`);

  // ── Confidence distribution ─────────────────────────────────────────
  const catConfDist = { high: 0, medium: 0, low: 0 };
  const specConfDist = { high: 0, medium: 0, low: 0 };
  for (const v of judgeResults.values()) {
    catConfDist[v.category_confidence as keyof typeof catConfDist]++;
    specConfDist[v.specificity_confidence as keyof typeof specConfDist]++;
  }
  console.log(`\n ── CONFIDENCE DISTRIBUTION ──`);
  console.log(` Category: high=${catConfDist.high} medium=${catConfDist.medium} low=${catConfDist.low}`);
  console.log(` Specificity: high=${specConfDist.high} medium=${specConfDist.medium} low=${specConfDist.low}`);

  // Write report JSON
  // (unary + converts toFixed strings back to numbers for the JSON report).
  const report = {
    model: MODEL, shortName, n,
    totalCost: +totalCost.toFixed(4),
    costPerCall: +(totalCost / n).toFixed(5),
    estFullCost: +(totalCost / n * 14623).toFixed(0),
    avgOutputTokens: +(totalOutput / n).toFixed(0),
    avgReasoningTokens: +(totalReasoning / n).toFixed(0),
    avgLatencyMs: +(totalLatency / n).toFixed(0),
    vsGold: hasGold ? { cat: +(goldCatMatch / goldTotal * 100).toFixed(1), spec: +(goldSpecMatch / goldTotal * 100).toFixed(1), both: +(goldBothMatch / goldTotal * 100).toFixed(1) } : null,
    vsMajority: { cat: +(agreeMajCat / n * 100).toFixed(1), spec: +(agreeMajSpec / n * 100).toFixed(1), both: +(agreeMajBoth / n * 100).toFixed(1) },
    majorityVsGold: hasGold ? { cat: +(majGoldCatMatch / majGoldTotal * 100).toFixed(1), spec: +(majGoldSpecMatch / majGoldTotal * 100).toFixed(1), both: +(majGoldBothMatch / majGoldTotal * 100).toFixed(1) } : null,
    confidenceCalibration: hasGold ? Object.fromEntries(Object.entries(confBuckets).map(([k, v]) => [k, { accuracy: v.total > 0 ? +(v.correct / v.total * 100).toFixed(1) : null, n: v.total }])) : null,
    sidesMajority: +(sidesMajority / n * 100).toFixed(1),
    sidesOutlier: +(sidesOutlier / n * 100).toFixed(1),
  };
  await Bun.write(`${BENCH_DIR}/${slug}.report.json`, JSON.stringify(report, null, 2) + "\n");
  console.error(`\n[${shortName}] Report saved`);
}
+
// Entry point: surface any unhandled failure and exit non-zero.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});
diff --git a/ts/scripts/judge-diag-batch.ts b/ts/scripts/judge-diag-batch.ts
new file mode 100644
index 0000000..241be6f
--- /dev/null
+++ b/ts/scripts/judge-diag-batch.ts
@@ -0,0 +1,114 @@
+/**
+ * Diagnostic: batch-test GLM-5 on failing paragraph IDs.
+ * Runs each twice to measure intermittent failure rate.
+ * Usage: bun ts/scripts/judge-diag-batch.ts [model-id]
+ */
+import { generateText, Output } from "ai";
+import { openrouter } from "../src/lib/openrouter.ts";
+import { readJsonl, readJsonlRaw } from "../src/lib/jsonl.ts";
+import { Paragraph } from "../src/schemas/paragraph.ts";
+import { LabelOutputRaw } from "../src/schemas/label.ts";
+import { SYSTEM_PROMPT, buildJudgePrompt } from "../src/label/prompts.ts";
+
// Judge model to probe; overridable via the first CLI argument.
const MODEL = process.argv[2] ?? "z-ai/glm-5";

// Paragraph UUIDs that previously failed in the judge bench run;
// each is retried twice below to measure the intermittent failure rate.
const FAILED_IDS = [
  "25e44b58-e11a-4633-8efe-c63836862cd9",
  "282c982b-35bb-4fa9-82e3-41c748aa0c83",
  "61bcdd6b-cd6b-415e-940a-59c77d8d757a",
  "66b02dbe-e7aa-4b6e-9fb4-47542f0cd980",
  "87b4fd8c-a095-4645-8969-5071a97d84b8",
  "887bc80e-08c5-4337-9a85-1669f8cde071",
  "c0d77667-1134-4347-a84e-cf640b463d7e",
  "c34fb56c-9190-4e93-8e75-c322dbb563ae",
  "c71ebd62-0fef-4ff5-af3f-943c4d7bfdbd",
];

// Input data locations, resolved relative to this script file.
const STAGE1_PATH = new URL("../../data/annotations/stage1.jsonl", import.meta.url).pathname;
const PARAGRAPHS_PATH = new URL("../../data/paragraphs/training.jsonl", import.meta.url).pathname;
+
/** Shape of one Stage 1 annotation record as read from stage1.jsonl. */
interface S1Ann {
  paragraphId: string;
  // The annotating model's label; `reasoning` is forwarded to the judge prompt.
  label: { content_category: string; specificity_level: number; reasoning: string };
  // Which Stage 1 model produced this annotation.
  provenance: { modelId: string };
}
+
// ── Load Stage 1 annotations and paragraph texts ──────────────────────
console.error(`Loading data...`);
const { records: allAnns } = await readJsonlRaw(STAGE1_PATH);
// paragraphId -> all Stage 1 annotations for that paragraph.
const s1ByParagraph = new Map();
for (const raw of allAnns) {
  const a = raw as S1Ann;
  let arr = s1ByParagraph.get(a.paragraphId);
  if (!arr) { arr = []; s1ByParagraph.set(a.paragraphId, arr); }
  arr.push(a);
}

const { records: allParagraphs } = await readJsonl(PARAGRAPHS_PATH, Paragraph);
const paragraphMap = new Map(allParagraphs.map(p => [p.id, p]));

console.error(`Testing ${FAILED_IDS.length} paragraphs × 2 attempts each...\n`);

// One entry per attempt (success or failure) for the summary below.
const results: { pid: string; attempt: number; success: boolean; modelId?: string; error?: string }[] = [];

// ── Sequentially retry each failing paragraph twice ───────────────────
for (const pid of FAILED_IDS) {
  const paragraph = paragraphMap.get(pid);
  if (!paragraph) { console.error(` Skip ${pid.slice(0, 8)} — not found`); continue; }

  const priorAnns = (s1ByParagraph.get(pid) ?? []).map(a => ({
    content_category: a.label.content_category,
    specificity_level: a.label.specificity_level,
    reasoning: a.label.reasoning,
  }));

  for (let attempt = 1; attempt <= 2; attempt++) {
    try {
      const result = await generateText({
        model: openrouter(MODEL),
        output: Output.object({ schema: LabelOutputRaw }),
        system: SYSTEM_PROMPT,
        prompt: buildJudgePrompt(paragraph, priorAnns),
        temperature: 0,
        providerOptions: {
          openrouter: {
            reasoning: { effort: "medium" },
            usage: { include: true },
          },
        },
        abortSignal: AbortSignal.timeout(120_000),
      });

      // Which concrete upstream model actually served the request.
      const modelId = result.response?.modelId ?? "unknown";
      // NOTE(review): other scripts read `specificity_level` off labels;
      // confirm the raw schema field here really is `specificity` — if not,
      // this logs `spec=undefined`.
      console.log(` ✓ ${pid.slice(0, 8)} #${attempt} — model=${modelId}, cat=${result.output?.content_category}, spec=${result.output?.specificity}`);
      results.push({ pid, attempt, success: true, modelId });
    } catch (err) {
      const msg = err instanceof Error ? err.message.slice(0, 200) : String(err);
      console.log(` ✗ ${pid.slice(0, 8)} #${attempt} — ${msg}`);
      results.push({ pid, attempt, success: false, error: msg });
    }
  }
}

// ── Summary: overall success rate ─────────────────────────────────────
const total = results.length;
const successes = results.filter(r => r.success).length;
const failures = results.filter(r => !r.success).length;
console.log(`\n=== SUMMARY ===`);
console.log(`Total: ${total}, Success: ${successes} (${(successes/total*100).toFixed(0)}%), Failed: ${failures} (${(failures/total*100).toFixed(0)}%)`);

// Group by modelId
// (reveals whether OpenRouter routed to different upstream variants).
const byModel = new Map();
for (const r of results.filter(r => r.success)) {
  byModel.set(r.modelId!, (byModel.get(r.modelId!) ?? 0) + 1);
}
console.log(`\nModel IDs seen:`, [...byModel.entries()].map(([m, n]) => `${m} (${n})`).join(", "));

// Per-paragraph failure rate
const byPid = new Map();
for (const r of results) {
  if (!byPid.has(r.pid)) byPid.set(r.pid, { ok: 0, fail: 0 });
  const b = byPid.get(r.pid)!;
  if (r.success) b.ok++; else b.fail++;
}
console.log(`\nPer-paragraph:`);
for (const [pid, { ok, fail }] of byPid) {
  console.log(` ${pid.slice(0, 8)}: ${ok} ok, ${fail} fail`);
}
diff --git a/ts/scripts/judge-diag.ts b/ts/scripts/judge-diag.ts
new file mode 100644
index 0000000..a393ff2
--- /dev/null
+++ b/ts/scripts/judge-diag.ts
@@ -0,0 +1,70 @@
+/**
+ * Diagnostic: call GLM-5 on a failing paragraph, log raw response + headers.
+ * Usage: bun ts/scripts/judge-diag.ts [model-id]
+ */
+import { generateText, Output } from "ai";
+import { openrouter } from "../src/lib/openrouter.ts";
+import { readJsonl, readJsonlRaw } from "../src/lib/jsonl.ts";
+import { Paragraph } from "../src/schemas/paragraph.ts";
+import { LabelOutputRaw } from "../src/schemas/label.ts";
+import { SYSTEM_PROMPT, buildJudgePrompt } from "../src/label/prompts.ts";
+
+const PID = process.argv[2];
+const MODEL = process.argv[3] ?? "z-ai/glm-5";
+if (!PID) { console.error("Usage: bun ts/scripts/judge-diag.ts [model-id]"); process.exit(1); }
+
+const STAGE1_PATH = new URL("../../data/annotations/stage1.jsonl", import.meta.url).pathname;
+const PARAGRAPHS_PATH = new URL("../../data/paragraphs/training.jsonl", import.meta.url).pathname;
+
+interface S1Ann {
+ paragraphId: string;
+ label: { content_category: string; specificity_level: number; reasoning: string };
+ provenance: { modelId: string };
+}
+
+console.error(`Loading data for ${PID}...`);
+const { records: allAnns } = await readJsonlRaw(STAGE1_PATH);
+const s1anns = (allAnns as S1Ann[]).filter(a => a.paragraphId === PID);
+
+const { records: allParagraphs } = await readJsonl(PARAGRAPHS_PATH, Paragraph);
+const paragraph = allParagraphs.find(p => p.id === PID);
+if (!paragraph) { console.error("Paragraph not found"); process.exit(1); }
+
+const priorAnns = s1anns.map(a => ({
+ content_category: a.label.content_category,
+ specificity_level: a.label.specificity_level,
+ reasoning: a.label.reasoning,
+}));
+
+const judgePrompt = buildJudgePrompt(paragraph, priorAnns);
+console.error(`\n=== JUDGE PROMPT (${judgePrompt.length} chars) ===\n`);
+console.error(judgePrompt.slice(0, 500) + "...\n");
+
+// ── Attempt 1: with structured output (like bench script) ──
+console.error("=== ATTEMPT WITH STRUCTURED OUTPUT ===");
+try {
+ const result = await generateText({
+ model: openrouter(MODEL),
+ output: Output.object({ schema: LabelOutputRaw }),
+ system: SYSTEM_PROMPT,
+ prompt: judgePrompt,
+ temperature: 0,
+ providerOptions: {
+ openrouter: {
+ reasoning: { effort: "medium" },
+ usage: { include: true },
+ },
+ },
+ abortSignal: AbortSignal.timeout(120_000),
+ });
+
+ console.log("SUCCESS (structured):");
+ console.log(" Output:", JSON.stringify(result.output, null, 2));
+ console.log(" Response ID:", result.response?.id);
+ console.log(" Model ID:", result.response?.modelId);
+ const hdrs = result.response?.headers;
+ console.log(" Headers:", JSON.stringify(hdrs && typeof hdrs === "object" ? hdrs : {}, null, 2));
+ console.log(" Provider metadata:", JSON.stringify(result.providerMetadata, null, 2));
+} catch (err) {
+ console.error("FAILED (structured):", err instanceof Error ? err.message.slice(0, 500) : String(err));
+}
diff --git a/ts/scripts/mimo-pilot.ts b/ts/scripts/mimo-pilot.ts
new file mode 100644
index 0000000..835e506
--- /dev/null
+++ b/ts/scripts/mimo-pilot.ts
@@ -0,0 +1,245 @@
+/**
+ * Run mimo-v2-flash on the same 500-sample pilot set used for prompt iteration.
+ * Compares against existing Stage 1 annotations to assess agreement.
+ *
+ * Usage: bun ts/scripts/mimo-pilot.ts
+ */
+import { readJsonl, readJsonlRaw, appendJsonl } from "../src/lib/jsonl.ts";
+import { Paragraph } from "../src/schemas/paragraph.ts";
+import { annotateParagraph, type AnnotateOpts } from "../src/label/annotate.ts";
+import { PROMPT_VERSION } from "../src/label/prompts.ts";
+import { v4 as uuidv4 } from "uuid";
+import { existsSync } from "node:fs";
+import pLimit from "p-limit";
+
// The 500-sample pilot set used for prompt iteration.
const PILOT_SAMPLE = new URL("../../data/pilot/pilot-sample-v2.5.jsonl", import.meta.url).pathname;
// Existing Stage 1 annotations used as the comparison baseline.
const STAGE1_PATH = new URL("../../data/annotations/stage1.jsonl", import.meta.url).pathname;
// mimo annotations are appended here; also scanned for resume support.
const OUTPUT_PATH = new URL("../../data/pilot/pilot-mimo-flash.jsonl", import.meta.url).pathname;
const MODEL = "xiaomi/mimo-v2-flash";
const CONCURRENCY = 15; // parallel annotation requests

// Minimal shape of a Stage 1 annotation record as read from JSONL.
interface S1Ann {
  paragraphId: string;
  label: { content_category: string; specificity_level: number };
  provenance: { modelId: string };
}
+
+function pct(n: number, total: number): string {
+ return `${((n / total) * 100).toFixed(1)}%`;
+}
+
+async function main() {
+ // Load pilot sample paragraphs
+ console.error("Loading pilot sample paragraphs...");
+ const { records: paragraphs } = await readJsonl(PILOT_SAMPLE, Paragraph);
+ console.error(` ${paragraphs.length} paragraphs`);
+
+ const pilotIds = new Set(paragraphs.map(p => p.id));
+
+ // Load Stage 1 annotations for these paragraphs
+ console.error("Loading Stage 1 annotations for comparison...");
+ const { records: allAnns } = await readJsonlRaw(STAGE1_PATH);
+ const s1ByParagraph = new Map();
+ for (const raw of allAnns) {
+ const a = raw as S1Ann;
+ if (!pilotIds.has(a.paragraphId)) continue;
+ let arr = s1ByParagraph.get(a.paragraphId);
+ if (!arr) { arr = []; s1ByParagraph.set(a.paragraphId, arr); }
+ arr.push(a);
+ }
+ console.error(` ${s1ByParagraph.size} paragraphs with Stage 1 data`);
+
+ // Resume support
+ const doneKeys = new Set();
+ if (existsSync(OUTPUT_PATH)) {
+ const { records: existing } = await readJsonlRaw(OUTPUT_PATH);
+ for (const r of existing) {
+ const a = r as { paragraphId?: string };
+ if (a.paragraphId) doneKeys.add(a.paragraphId);
+ }
+ if (doneKeys.size > 0) console.error(` Resuming: ${doneKeys.size} already done`);
+ }
+
+ const remaining = paragraphs.filter(p => !doneKeys.has(p.id));
+ console.error(` Running ${remaining.length} annotations...\n`);
+
+ // Run mimo on remaining paragraphs
+ const runId = uuidv4();
+ const limit = pLimit(CONCURRENCY);
+ let completed = 0, failed = 0, totalCost = 0;
+ const startTime = Date.now();
+
+ const tasks = remaining.map(p => limit(async () => {
+ const opts: AnnotateOpts = {
+ modelId: MODEL,
+ stage: "benchmark",
+ runId,
+ promptVersion: PROMPT_VERSION,
+ reasoningEffort: "low",
+ };
+ try {
+ const ann = await annotateParagraph(p, opts);
+ await appendJsonl(OUTPUT_PATH, ann);
+ totalCost += ann.provenance.costUsd;
+ completed++;
+ if (completed % 50 === 0) {
+ const elapsed = (Date.now() - startTime) / 1000;
+ process.stderr.write(`\r ${completed}/${remaining.length} (${(completed / elapsed).toFixed(1)}/s, $${totalCost.toFixed(2)}) `);
+ }
+ } catch (err) {
+ failed++;
+ console.error(`\n ✖ ${p.id.slice(0, 8)}: ${err instanceof Error ? err.message : String(err)}`);
+ }
+ }));
+
+ await Promise.all(tasks);
+ const elapsed = ((Date.now() - startTime) / 1000).toFixed(0);
+ console.error(`\n\n Done: ${completed} completed, ${failed} failed, $${totalCost.toFixed(2)}, ${elapsed}s\n`);
+
+ // ── Analysis ─────────────────────────────────────────────────────────
+ // Load all mimo results (including resumed)
+ const { records: mimoRaw } = await readJsonlRaw(OUTPUT_PATH);
+ const mimoByParagraph = new Map();
+ for (const r of mimoRaw) {
+ const a = r as { paragraphId: string; label: { content_category: string; specificity_level: number } };
+ mimoByParagraph.set(a.paragraphId, a.label);
+ }
+
+ const s1Models = ["google/gemini-3.1-flash-lite-preview", "openai/gpt-5.4-nano", "x-ai/grok-4.1-fast"];
+ const shortName = (m: string) => m.split("/").pop()!;
+
+ console.log("═══════════════════════════════════════════════════════════");
+ console.log(" MIMO-V2-FLASH PILOT COMPARISON (n=" + mimoByParagraph.size + ")");
+ console.log("═══════════════════════════════════════════════════════════\n");
+
+ // Pairwise agreement: mimo vs each Stage 1 model
+ console.log("── Pairwise Agreement (mimo vs Stage 1 models) ─────────────");
+ for (const model of s1Models) {
+ let catAgree = 0, specAgree = 0, bothAgree = 0, total = 0;
+ for (const [pid, mimoLabel] of mimoByParagraph) {
+ const s1anns = s1ByParagraph.get(pid);
+ if (!s1anns) continue;
+ const s1ann = s1anns.find(a => a.provenance.modelId === model);
+ if (!s1ann) continue;
+ total++;
+ if (s1ann.label.content_category === mimoLabel.content_category) catAgree++;
+ if (s1ann.label.specificity_level === mimoLabel.specificity_level) specAgree++;
+ if (s1ann.label.content_category === mimoLabel.content_category &&
+ s1ann.label.specificity_level === mimoLabel.specificity_level) bothAgree++;
+ }
+ console.log(`\n mimo × ${shortName(model)} (n=${total}):`);
+ console.log(` Category: ${pct(catAgree, total)} (${catAgree})`);
+ console.log(` Specificity: ${pct(specAgree, total)} (${specAgree})`);
+ console.log(` Both: ${pct(bothAgree, total)} (${bothAgree})`);
+ }
+
+ // Agreement with majority vote
+ console.log("\n── Agreement with Stage 1 Majority Vote ───────────────────");
+ let catMajAgree = 0, specMajAgree = 0, bothMajAgree = 0, totalMaj = 0;
+ for (const [pid, mimoLabel] of mimoByParagraph) {
+ const s1anns = s1ByParagraph.get(pid);
+ if (!s1anns || s1anns.length !== 3) continue;
+ totalMaj++;
+
+ // Category majority
+ const cats = s1anns.map(a => a.label.content_category);
+ const catFreq = new Map();
+ for (const c of cats) catFreq.set(c, (catFreq.get(c) ?? 0) + 1);
+ const majCat = [...catFreq.entries()].find(([, v]) => v >= 2)?.[0];
+
+ // Specificity majority
+ const specs = s1anns.map(a => a.label.specificity_level);
+ const specFreq = new Map();
+ for (const s of specs) specFreq.set(s, (specFreq.get(s) ?? 0) + 1);
+ const majSpec = [...specFreq.entries()].find(([, v]) => v >= 2)?.[0];
+
+ const catOk = majCat !== undefined && mimoLabel.content_category === majCat;
+ const specOk = majSpec !== undefined && mimoLabel.specificity_level === majSpec;
+ if (catOk) catMajAgree++;
+ if (specOk) specMajAgree++;
+ if (catOk && specOk) bothMajAgree++;
+ }
+ console.log(` mimo vs majority (n=${totalMaj}):`);
+ console.log(` Category: ${pct(catMajAgree, totalMaj)} (${catMajAgree})`);
+ console.log(` Specificity: ${pct(specMajAgree, totalMaj)} (${specMajAgree})`);
+ console.log(` Both: ${pct(bothMajAgree, totalMaj)} (${bothMajAgree})`);
+
+ // Unanimity: if mimo replaced nano, what would the new unanimity be?
+ console.log("\n── Hypothetical: replace nano with mimo ────────────────────");
+ let newCatUnan = 0, newSpecUnan = 0, newBothUnan = 0;
+ let oldCatUnan = 0, oldSpecUnan = 0, oldBothUnan = 0;
+ let nCompare = 0;
+
+ for (const [pid, mimoLabel] of mimoByParagraph) {
+ const s1anns = s1ByParagraph.get(pid);
+ if (!s1anns || s1anns.length !== 3) continue;
+ nCompare++;
+
+ const gemini = s1anns.find(a => a.provenance.modelId.includes("gemini"))!;
+ const nano = s1anns.find(a => a.provenance.modelId.includes("nano"))!;
+ const grok = s1anns.find(a => a.provenance.modelId.includes("grok"))!;
+
+ // Old (with nano)
+ const oldCats = [gemini, nano, grok].map(a => a.label.content_category);
+ const oldSpecs = [gemini, nano, grok].map(a => a.label.specificity_level);
+ const oldCU = new Set(oldCats).size === 1;
+ const oldSU = new Set(oldSpecs).size === 1;
+ if (oldCU) oldCatUnan++;
+ if (oldSU) oldSpecUnan++;
+ if (oldCU && oldSU) oldBothUnan++;
+
+ // New (with mimo replacing nano)
+ const newCats = [gemini.label.content_category, mimoLabel.content_category, grok.label.content_category];
+ const newSpecs = [gemini.label.specificity_level, mimoLabel.specificity_level, grok.label.specificity_level];
+ const newCU = new Set(newCats).size === 1;
+ const newSU = new Set(newSpecs).size === 1;
+ if (newCU) newCatUnan++;
+ if (newSU) newSpecUnan++;
+ if (newCU && newSU) newBothUnan++;
+ }
+
+ console.log(` n=${nCompare}`);
+ console.log(` Old (nano) New (mimo) Delta`);
+ console.log(` Category: ${pct(oldCatUnan, nCompare).padStart(6)} ${pct(newCatUnan, nCompare).padStart(6)} ${((newCatUnan - oldCatUnan) / nCompare * 100).toFixed(1).padStart(5)}pp`);
+ console.log(` Specificity: ${pct(oldSpecUnan, nCompare).padStart(6)} ${pct(newSpecUnan, nCompare).padStart(6)} ${((newSpecUnan - oldSpecUnan) / nCompare * 100).toFixed(1).padStart(5)}pp`);
+ console.log(` Both: ${pct(oldBothUnan, nCompare).padStart(6)} ${pct(newBothUnan, nCompare).padStart(6)} ${((newBothUnan - oldBothUnan) / nCompare * 100).toFixed(1).padStart(5)}pp`);
+
+ // Outlier analysis
+ console.log("\n── Outlier Rate (mimo in 3-model panel) ────────────────────");
+ let mimoCatOut = 0, mimoSpecOut = 0;
+ for (const [pid, mimoLabel] of mimoByParagraph) {
+ const s1anns = s1ByParagraph.get(pid);
+ if (!s1anns || s1anns.length !== 3) continue;
+
+ const gemini = s1anns.find(a => a.provenance.modelId.includes("gemini"))!;
+ const grok = s1anns.find(a => a.provenance.modelId.includes("grok"))!;
+
+ // mimo is outlier when gemini and grok agree but mimo differs
+ if (gemini.label.content_category === grok.label.content_category &&
+ mimoLabel.content_category !== gemini.label.content_category) mimoCatOut++;
+ if (gemini.label.specificity_level === grok.label.specificity_level &&
+ mimoLabel.specificity_level !== gemini.label.specificity_level) mimoSpecOut++;
+ }
+ console.log(` When gemini×grok agree, mimo disagrees:`);
+ console.log(` Category: ${mimoCatOut} (${pct(mimoCatOut, nCompare)})`);
+ console.log(` Specificity: ${mimoSpecOut} (${pct(mimoSpecOut, nCompare)})`);
+
+ // For comparison: nano outlier rate on same paragraphs
+ let nanoCatOut = 0, nanoSpecOut = 0;
+ for (const [pid] of mimoByParagraph) {
+ const s1anns = s1ByParagraph.get(pid);
+ if (!s1anns || s1anns.length !== 3) continue;
+ const gemini = s1anns.find(a => a.provenance.modelId.includes("gemini"))!;
+ const nano = s1anns.find(a => a.provenance.modelId.includes("nano"))!;
+ const grok = s1anns.find(a => a.provenance.modelId.includes("grok"))!;
+ if (gemini.label.content_category === grok.label.content_category &&
+ nano.label.content_category !== gemini.label.content_category) nanoCatOut++;
+ if (gemini.label.specificity_level === grok.label.specificity_level &&
+ nano.label.specificity_level !== gemini.label.specificity_level) nanoSpecOut++;
+ }
+ console.log(`\n For comparison, nano disagrees when gemini×grok agree:`);
+ console.log(` Category: ${nanoCatOut} (${pct(nanoCatOut, nCompare)})`);
+ console.log(` Specificity: ${nanoSpecOut} (${pct(nanoSpecOut, nCompare)})`);
+}
+
+main().catch(err => { console.error(err); process.exit(1); });
diff --git a/ts/scripts/mimo-raw-test.ts b/ts/scripts/mimo-raw-test.ts
new file mode 100644
index 0000000..8d327f1
--- /dev/null
+++ b/ts/scripts/mimo-raw-test.ts
@@ -0,0 +1,83 @@
+/**
+ * Raw test: call mimo-v2-flash on 20 paragraphs without Output.object,
+ * then validate each response against the schema to find failure patterns.
+ */
+import { generateText } from "ai";
+import { openrouter } from "../src/lib/openrouter.ts";
+import { readJsonl } from "../src/lib/jsonl.ts";
+import { Paragraph } from "../src/schemas/paragraph.ts";
+import { SYSTEM_PROMPT, buildUserPrompt } from "../src/label/prompts.ts";
+import { LabelOutputRaw } from "../src/schemas/label.ts";
+
+const INPUT = new URL("../../data/paragraphs/training.jsonl", import.meta.url).pathname;
+const MODEL = "xiaomi/mimo-v2-flash";
+const N = 20;
+
+async function main() {
+ const { records: paragraphs } = await readJsonl(INPUT, Paragraph);
+ const sample = paragraphs.slice(0, N);
+
+ let pass = 0, fail = 0;
+ const failures: { id: string; issues: unknown; raw: unknown }[] = [];
+
+ for (const p of sample) {
+ try {
+ const result = await generateText({
+ model: openrouter(MODEL),
+ system: SYSTEM_PROMPT,
+ prompt: buildUserPrompt(p),
+ temperature: 0,
+ providerOptions: {
+ openrouter: {
+ reasoning: { effort: "low" },
+ usage: { include: true },
+ },
+ },
+ abortSignal: AbortSignal.timeout(120_000),
+ });
+
+ // Try to parse as JSON
+ let parsed: unknown;
+ try {
+ parsed = JSON.parse(result.text);
+ } catch {
+ // Try extracting JSON from markdown/text
+ const match = result.text.match(/\{[\s\S]*\}/);
+ if (match) parsed = JSON.parse(match[0]);
+ else {
+ failures.push({ id: p.id, issues: "Not JSON", raw: result.text.slice(0, 500) });
+ fail++;
+ continue;
+ }
+ }
+
+ const validation = LabelOutputRaw.safeParse(parsed);
+ if (validation.success) {
+ pass++;
+ process.stdout.write(".");
+ } else {
+ fail++;
+ failures.push({ id: p.id, issues: validation.error.issues, raw: parsed });
+ process.stdout.write("✖");
+ }
+ } catch (err) {
+ fail++;
+ failures.push({ id: p.id, issues: err instanceof Error ? err.message : String(err), raw: null });
+ process.stdout.write("E");
+ }
+ }
+
+ console.log(`\n\n${pass}/${N} passed, ${fail} failed\n`);
+
+ if (failures.length > 0) {
+ console.log("=== FAILURES ===\n");
+ for (const f of failures) {
+ console.log(`--- ${f.id} ---`);
+ console.log("Issues:", JSON.stringify(f.issues, null, 2));
+ console.log("Raw:", JSON.stringify(f.raw, null, 2)?.slice(0, 1000));
+ console.log();
+ }
+ }
+}
+
+main().catch(err => { console.error(err); process.exit(1); });
diff --git a/ts/scripts/mimo-test.ts b/ts/scripts/mimo-test.ts
new file mode 100644
index 0000000..633570c
--- /dev/null
+++ b/ts/scripts/mimo-test.ts
@@ -0,0 +1,44 @@
+/**
+ * Quick smoke test: run mimo-v2-flash on 10 paragraphs to see if it works.
+ * Usage: bun ts/scripts/mimo-test.ts
+ */
+import { readJsonl } from "../src/lib/jsonl.ts";
+import { Paragraph } from "../src/schemas/paragraph.ts";
+import { annotateParagraph } from "../src/label/annotate.ts";
+import { v4 as uuidv4 } from "uuid";
+
+const INPUT = new URL("../../data/paragraphs/training.jsonl", import.meta.url).pathname;
+const MODEL = "xiaomi/mimo-v2-flash";
+const N = 10;
+
+async function main() {
+ const { records: paragraphs } = await readJsonl(INPUT, Paragraph);
+ const sample = paragraphs.slice(0, N);
+ const runId = uuidv4();
+
+ console.log(`Testing ${MODEL} on ${N} paragraphs...\n`);
+
+ let success = 0;
+ let failed = 0;
+
+ for (const p of sample) {
+ try {
+ const ann = await annotateParagraph(p, {
+ modelId: MODEL,
+ stage: "benchmark",
+ runId,
+ reasoningEffort: "low",
+ });
+ success++;
+ console.log(`✓ ${p.id.slice(0, 8)} → ${ann.label.content_category} / ${ann.label.specificity_level} (${ann.provenance.outputTokens} out, ${ann.provenance.reasoningTokens} reasoning, ${ann.provenance.latencyMs}ms, $${ann.provenance.costUsd.toFixed(5)})`);
+ } catch (err) {
+ failed++;
+ const msg = err instanceof Error ? err.message : String(err);
+ console.log(`✖ ${p.id.slice(0, 8)} → ${msg}`);
+ }
+ }
+
+ console.log(`\n${success}/${N} succeeded, ${failed} failed`);
+}
+
+main().catch(err => { console.error(err); process.exit(1); });
diff --git a/ts/scripts/model-bench.ts b/ts/scripts/model-bench.ts
new file mode 100644
index 0000000..d1d541b
--- /dev/null
+++ b/ts/scripts/model-bench.ts
@@ -0,0 +1,259 @@
+/**
+ * Benchmark a single model on the 500-sample pilot set.
+ * Outputs JSONL + comparison report against Stage 1 annotations.
+ *
+ * Usage: bun ts/scripts/model-bench.ts [--smoke] [--concurrency 15]
+ *
+ * --smoke: run only 5 paragraphs to check schema compliance
+ */
+import { readJsonl, readJsonlRaw, appendJsonl } from "../src/lib/jsonl.ts";
+import { Paragraph } from "../src/schemas/paragraph.ts";
+import { annotateParagraph, type AnnotateOpts } from "../src/label/annotate.ts";
+import { PROMPT_VERSION } from "../src/label/prompts.ts";
+import { v4 as uuidv4 } from "uuid";
+import { existsSync } from "node:fs";
+import pLimit from "p-limit";
+
+const args = process.argv.slice(2);
+const MODEL = args.find(a => !a.startsWith("--"))!;
+if (!MODEL) { console.error("Usage: bun ts/scripts/model-bench.ts [--smoke]"); process.exit(1); }
+
+const SMOKE = args.includes("--smoke");
+const concIdx = args.indexOf("--concurrency");
+const CONCURRENCY = concIdx !== -1 ? parseInt(args[concIdx + 1], 10) : 15;
+
+const PILOT_SAMPLE = new URL("../../data/pilot/pilot-sample-v2.5.jsonl", import.meta.url).pathname;
+const STAGE1_PATH = new URL("../../data/annotations/stage1.jsonl", import.meta.url).pathname;
+const slug = MODEL.replace("/", "_");
+const OUTPUT_PATH = new URL(`../../data/bench/${slug}.jsonl`, import.meta.url).pathname;
+
+import { mkdir } from "node:fs/promises";
+const benchDir = new URL("../../data/bench", import.meta.url).pathname;
+if (!existsSync(benchDir)) await mkdir(benchDir, { recursive: true });
+
+interface S1Ann {
+ paragraphId: string;
+ label: { content_category: string; specificity_level: number };
+ provenance: { modelId: string };
+}
+
+function pct(n: number, total: number): string {
+ return `${((n / total) * 100).toFixed(1)}%`;
+}
+
+async function main() {
+ const shortName = MODEL.split("/").pop()!;
+ console.error(`\n[${shortName}] Loading data...`);
+
+ const { records: allParagraphs } = await readJsonl(PILOT_SAMPLE, Paragraph);
+ const paragraphs = SMOKE ? allParagraphs.slice(0, 5) : allParagraphs;
+ console.error(`[${shortName}] ${paragraphs.length} paragraphs ${SMOKE ? "(smoke test)" : ""}`);
+
+ // Resume support
+ const doneKeys = new Set();
+ if (existsSync(OUTPUT_PATH)) {
+ const { records: existing } = await readJsonlRaw(OUTPUT_PATH);
+ for (const r of existing) {
+ const a = r as { paragraphId?: string };
+ if (a.paragraphId) doneKeys.add(a.paragraphId);
+ }
+ if (doneKeys.size > 0) console.error(`[${shortName}] Resuming: ${doneKeys.size} already done`);
+ }
+
+ const remaining = paragraphs.filter(p => !doneKeys.has(p.id));
+ if (remaining.length === 0) {
+ console.error(`[${shortName}] All done, skipping to analysis`);
+ } else {
+ console.error(`[${shortName}] Running ${remaining.length} annotations (concurrency=${CONCURRENCY})...\n`);
+
+ const runId = uuidv4();
+ const limit = pLimit(CONCURRENCY);
+ let completed = 0, failed = 0, totalCost = 0;
+ const errors: { id: string; msg: string }[] = [];
+ const startTime = Date.now();
+
+ const tasks = remaining.map(p => limit(async () => {
+ const opts: AnnotateOpts = {
+ modelId: MODEL,
+ stage: "benchmark",
+ runId,
+ promptVersion: PROMPT_VERSION,
+ reasoningEffort: "low",
+ };
+ try {
+ const ann = await annotateParagraph(p, opts);
+ await appendJsonl(OUTPUT_PATH, ann);
+ totalCost += ann.provenance.costUsd;
+ completed++;
+ if (completed % 50 === 0 || SMOKE) {
+ const elapsed = (Date.now() - startTime) / 1000;
+ process.stderr.write(`\r[${shortName}] ${completed}/${remaining.length} (${(completed / elapsed).toFixed(1)}/s, $${totalCost.toFixed(4)}) `);
+ }
+ } catch (err) {
+ failed++;
+ const msg = err instanceof Error ? err.message : String(err);
+ errors.push({ id: p.id.slice(0, 8), msg: msg.slice(0, 200) });
+ if (SMOKE || failed <= 5) {
+ console.error(`\n[${shortName}] ✖ ${p.id.slice(0, 8)}: ${msg.slice(0, 300)}`);
+ }
+ }
+ }));
+
+ await Promise.all(tasks);
+ const elapsed = ((Date.now() - startTime) / 1000).toFixed(0);
+ console.error(`\n[${shortName}] Done: ${completed} ok, ${failed} failed, $${totalCost.toFixed(4)}, ${elapsed}s`);
+
+ if (errors.length > 5) {
+ console.error(`[${shortName}] ... and ${errors.length - 5} more errors`);
+ }
+
+ if (SMOKE) {
+ console.error(`\n[${shortName}] Smoke test complete.`);
+ return;
+ }
+ }
+
+ // ── Analysis ─────────────────────────────────────────────────────────
+ if (SMOKE) return;
+
+ const pilotIds = new Set(paragraphs.map(p => p.id));
+
+ console.error(`[${shortName}] Loading Stage 1 data for comparison...`);
+ const { records: allAnns } = await readJsonlRaw(STAGE1_PATH);
+ const s1ByParagraph = new Map();
+ for (const raw of allAnns) {
+ const a = raw as S1Ann;
+ if (!pilotIds.has(a.paragraphId)) continue;
+ let arr = s1ByParagraph.get(a.paragraphId);
+ if (!arr) { arr = []; s1ByParagraph.set(a.paragraphId, arr); }
+ arr.push(a);
+ }
+
+ const { records: benchRaw } = await readJsonlRaw(OUTPUT_PATH);
+ const benchByParagraph = new Map();
+ for (const r of benchRaw) {
+ const a = r as { paragraphId: string; label: { content_category: string; specificity_level: number }; provenance: { costUsd: number; latencyMs: number; outputTokens: number; reasoningTokens: number } };
+ benchByParagraph.set(a.paragraphId, { ...a.label, costUsd: a.provenance.costUsd, latencyMs: a.provenance.latencyMs, outputTokens: a.provenance.outputTokens, reasoningTokens: a.provenance.reasoningTokens });
+ }
+
+ const n = benchByParagraph.size;
+ const s1Models = ["google/gemini-3.1-flash-lite-preview", "openai/gpt-5.4-nano", "x-ai/grok-4.1-fast"];
+ const sn = (m: string) => m.split("/").pop()!;
+
+ let totalCost = 0, totalLatency = 0, totalOutput = 0, totalReasoning = 0;
+ for (const v of benchByParagraph.values()) {
+ totalCost += v.costUsd;
+ totalLatency += v.latencyMs;
+ totalOutput += v.outputTokens;
+ totalReasoning += v.reasoningTokens;
+ }
+
+ // Output structured JSON report for aggregation
+ const report: Record = {
+ model: MODEL,
+ shortName,
+ n,
+ totalCost: +totalCost.toFixed(4),
+ avgCost: +(totalCost / n).toFixed(6),
+ avgLatencyMs: +(totalLatency / n).toFixed(0),
+ avgOutputTokens: +(totalOutput / n).toFixed(0),
+ avgReasoningTokens: +(totalReasoning / n).toFixed(0),
+ pairwise: {} as Record,
+ };
+
+ console.log(`\n═══ ${shortName} (n=${n}) ═══`);
+ console.log(` Cost: $${totalCost.toFixed(4)} total, $${(totalCost / n).toFixed(6)}/ann`);
+ console.log(` Latency: ${(totalLatency / n).toFixed(0)}ms avg`);
+ console.log(` Output: ${(totalOutput / n).toFixed(0)} tokens avg, ${(totalReasoning / n).toFixed(0)} reasoning avg`);
+
+ // Pairwise
+ console.log("\n Pairwise vs Stage 1:");
+ for (const model of s1Models) {
+ let catAgree = 0, specAgree = 0, bothAgree = 0, total = 0;
+ for (const [pid, bl] of benchByParagraph) {
+ const s1anns = s1ByParagraph.get(pid);
+ if (!s1anns) continue;
+ const s1 = s1anns.find(a => a.provenance.modelId === model);
+ if (!s1) continue;
+ total++;
+ if (s1.label.content_category === bl.content_category) catAgree++;
+ if (s1.label.specificity_level === bl.specificity_level) specAgree++;
+ if (s1.label.content_category === bl.content_category && s1.label.specificity_level === bl.specificity_level) bothAgree++;
+ }
+ (report.pairwise as Record)[sn(model)] = { cat: +(catAgree / total * 100).toFixed(1), spec: +(specAgree / total * 100).toFixed(1), both: +(bothAgree / total * 100).toFixed(1) };
+ console.log(` × ${sn(model).padEnd(30)} cat ${pct(catAgree, total).padStart(6)} spec ${pct(specAgree, total).padStart(6)} both ${pct(bothAgree, total).padStart(6)}`);
+ }
+
+ // Majority agreement
+ let catMajAgree = 0, specMajAgree = 0, bothMajAgree = 0, totalMaj = 0;
+ for (const [pid, bl] of benchByParagraph) {
+ const s1anns = s1ByParagraph.get(pid);
+ if (!s1anns || s1anns.length !== 3) continue;
+ totalMaj++;
+ const cats = s1anns.map(a => a.label.content_category);
+ const catFreq = new Map();
+ for (const c of cats) catFreq.set(c, (catFreq.get(c) ?? 0) + 1);
+ const majCat = [...catFreq.entries()].find(([, v]) => v >= 2)?.[0];
+ const specs = s1anns.map(a => a.label.specificity_level);
+ const specFreq = new Map();
+ for (const s of specs) specFreq.set(s, (specFreq.get(s) ?? 0) + 1);
+ const majSpec = [...specFreq.entries()].find(([, v]) => v >= 2)?.[0];
+ if (majCat && bl.content_category === majCat) catMajAgree++;
+ if (majSpec !== undefined && bl.specificity_level === majSpec) specMajAgree++;
+ if (majCat && bl.content_category === majCat && majSpec !== undefined && bl.specificity_level === majSpec) bothMajAgree++;
+ }
+ report.vsMajority = { cat: +(catMajAgree / totalMaj * 100).toFixed(1), spec: +(specMajAgree / totalMaj * 100).toFixed(1), both: +(bothMajAgree / totalMaj * 100).toFixed(1) };
+ console.log(`\n vs Majority Vote: cat ${pct(catMajAgree, totalMaj).padStart(6)} spec ${pct(specMajAgree, totalMaj).padStart(6)} both ${pct(bothMajAgree, totalMaj).padStart(6)}`);
+
+ // Hypothetical replacement of nano
+ let newCatUnan = 0, newSpecUnan = 0, newBothUnan = 0;
+ let oldCatUnan = 0, oldSpecUnan = 0, oldBothUnan = 0;
+ let nCompare = 0;
+ for (const [pid, bl] of benchByParagraph) {
+ const s1anns = s1ByParagraph.get(pid);
+ if (!s1anns || s1anns.length !== 3) continue;
+ nCompare++;
+ const gemini = s1anns.find(a => a.provenance.modelId.includes("gemini"))!;
+ const nano = s1anns.find(a => a.provenance.modelId.includes("nano"))!;
+ const grok = s1anns.find(a => a.provenance.modelId.includes("grok"))!;
+
+ const oldCats = [gemini, nano, grok].map(a => a.label.content_category);
+ const oldSpecs = [gemini, nano, grok].map(a => a.label.specificity_level);
+ if (new Set(oldCats).size === 1) oldCatUnan++;
+ if (new Set(oldSpecs).size === 1) oldSpecUnan++;
+ if (new Set(oldCats).size === 1 && new Set(oldSpecs).size === 1) oldBothUnan++;
+
+ const newCats = [gemini.label.content_category, bl.content_category, grok.label.content_category];
+ const newSpecs = [gemini.label.specificity_level, bl.specificity_level, grok.label.specificity_level];
+ if (new Set(newCats).size === 1) newCatUnan++;
+ if (new Set(newSpecs).size === 1) newSpecUnan++;
+ if (new Set(newCats).size === 1 && new Set(newSpecs).size === 1) newBothUnan++;
+ }
+ report.replaceNano = {
+ oldBothUnan: +(oldBothUnan / nCompare * 100).toFixed(1),
+ newBothUnan: +(newBothUnan / nCompare * 100).toFixed(1),
+ deltaBothPp: +((newBothUnan - oldBothUnan) / nCompare * 100).toFixed(1),
+ };
+ console.log(`\n Replace nano hypothetical (n=${nCompare}):`);
+ console.log(` Both-unan: ${pct(oldBothUnan, nCompare)} → ${pct(newBothUnan, nCompare)} (${((newBothUnan - oldBothUnan) / nCompare * 100).toFixed(1)}pp)`);
+
+ // Outlier rate vs gemini×grok
+ let benchCatOut = 0, benchSpecOut = 0;
+ for (const [pid, bl] of benchByParagraph) {
+ const s1anns = s1ByParagraph.get(pid);
+ if (!s1anns || s1anns.length !== 3) continue;
+ const gemini = s1anns.find(a => a.provenance.modelId.includes("gemini"))!;
+ const grok = s1anns.find(a => a.provenance.modelId.includes("grok"))!;
+ if (gemini.label.content_category === grok.label.content_category && bl.content_category !== gemini.label.content_category) benchCatOut++;
+ if (gemini.label.specificity_level === grok.label.specificity_level && bl.specificity_level !== gemini.label.specificity_level) benchSpecOut++;
+ }
+ report.outlierVsGeminiGrok = { cat: +(benchCatOut / nCompare * 100).toFixed(1), spec: +(benchSpecOut / nCompare * 100).toFixed(1) };
+ console.log(`\n Outlier (gemini×grok agree, ${shortName} differs): cat ${pct(benchCatOut, nCompare)}, spec ${pct(benchSpecOut, nCompare)}`);
+
+ // Write report JSON
+ const reportPath = new URL(`../../data/bench/${slug}.report.json`, import.meta.url).pathname;
+ await Bun.write(reportPath, JSON.stringify(report, null, 2) + "\n");
+ console.error(`\n[${shortName}] Report saved to ${reportPath}`);
+}
+
+main().catch(err => { console.error(err); process.exit(1); });
diff --git a/ts/scripts/model-bias-analysis.ts b/ts/scripts/model-bias-analysis.ts
new file mode 100644
index 0000000..32c0544
--- /dev/null
+++ b/ts/scripts/model-bias-analysis.ts
@@ -0,0 +1,470 @@
+/**
+ * Model bias analysis for Stage 1 annotations.
+ * Identifies which model is the outlier most often, systematic biases,
+ * pairwise agreement, and category-specific dispute patterns.
+ *
+ * Usage: bun ts/scripts/model-bias-analysis.ts
+ */
+import { readJsonl, readJsonlRaw } from "../src/lib/jsonl.ts";
+import { Paragraph } from "../src/schemas/paragraph.ts";
+
+const PARAGRAPHS_PATH = new URL(
+  "../../data/paragraphs/paragraphs-clean.jsonl",
+  import.meta.url,
+).pathname;
+const ANNOTATIONS_PATH = new URL(
+  "../../data/annotations/stage1.jsonl",
+  import.meta.url,
+).pathname;
+
+// The three Stage 1 annotator models, in a fixed order used throughout.
+const MODELS = [
+  "google/gemini-3.1-flash-lite-preview",
+  "x-ai/grok-4.1-fast",
+  "xiaomi/mimo-v2-flash",
+] as const;
+type ModelId = (typeof MODELS)[number];
+
+// Short display names for report tables.
+// NOTE(review): Record's type arguments were stripped by the scrape;
+// restored as Record<ModelId, string>, which the usage below implies.
+const SHORT: Record<ModelId, string> = {
+  "google/gemini-3.1-flash-lite-preview": "Gemini",
+  "x-ai/grok-4.1-fast": "Grok",
+  "xiaomi/mimo-v2-flash": "Mimo",
+};
+
+// One record from stage1.jsonl: a single model's label for a single
+// paragraph, plus provenance (which model, cost, tokens, timing).
+interface Ann {
+  paragraphId: string;
+  label: {
+    content_category: string;
+    // Integer level; the analyses below assume the 1-4 range.
+    specificity_level: number;
+    category_confidence: string;
+    specificity_confidence: string;
+    reasoning: string;
+  };
+  provenance: {
+    modelId: string;
+    costUsd: number;
+    inputTokens: number;
+    outputTokens: number;
+    reasoningTokens: number;
+    latencyMs: number;
+    requestedAt: string;
+  };
+}
+
+// ── Helpers ──────────────────────────────────────────────────────────────
+
+/** Format n/total as a one-decimal percentage string, e.g. "25.0%". */
+function pct(n: number, total: number): string {
+  // Guard against a zero denominator so callers never print "NaN%".
+  if (total === 0) return "0.0%";
+  const share = (100 * n) / total;
+  return `${share.toFixed(1)}%`;
+}
+
+/** Space-pad `s` on the right to `len`; longer strings pass through. */
+function padRight(s: string, len: number): string {
+  return s.padEnd(len, " ");
+}
+
+/** Space-pad `s` on the left to `len`; longer strings pass through. */
+function padLeft(s: string, len: number): string {
+  return s.padStart(len, " ");
+}
+
+function printTable(headers: string[], rows: string[][], colWidths?: number[]) {
+ const widths =
+ colWidths ??
+ headers.map((h, i) =>
+ Math.max(h.length, ...rows.map((r) => (r[i] ?? "").length)),
+ );
+ const headerLine = headers.map((h, i) => padRight(h, widths[i])).join(" ");
+ const sep = widths.map((w) => "-".repeat(w)).join(" ");
+ console.log(headerLine);
+ console.log(sep);
+ for (const row of rows) {
+ console.log(row.map((c, i) => padRight(c, widths[i])).join(" "));
+ }
+}
+
+// ── Load data ────────────────────────────────────────────────────────────
+
+console.log("Loading data...");
+const [{ records: paragraphs, skipped: pSkip }, { records: rawAnns, skipped: aSkip }] =
+  await Promise.all([
+    readJsonl(PARAGRAPHS_PATH, Paragraph),
+    readJsonlRaw(ANNOTATIONS_PATH),
+  ]);
+const annotations = rawAnns as Ann[];
+
+console.log(
+  `Loaded ${paragraphs.length} paragraphs (${pSkip} skipped), ${annotations.length} annotations (${aSkip} skipped)\n`,
+);
+
+// ── Group annotations by paragraphId ─────────────────────────────────────
+
+// paragraphId -> (modelId -> annotation).
+// NOTE(review): Map's type arguments were stripped by the scrape;
+// restored as Map<string, Map<ModelId, Ann>> per usage.
+const byParagraph = new Map<string, Map<ModelId, Ann>>();
+for (const ann of annotations) {
+  const mid = ann.provenance.modelId as ModelId;
+  if (!MODELS.includes(mid)) continue; // ignore annotations from other models
+  if (!byParagraph.has(ann.paragraphId)) byParagraph.set(ann.paragraphId, new Map());
+  byParagraph.get(ann.paragraphId)!.set(mid, ann);
+}
+
+// Only keep paragraphs with all 3 models
+const complete = new Map<string, Map<ModelId, Ann>>();
+for (const [pid, models] of byParagraph) {
+  if (models.size === 3) complete.set(pid, models);
+}
+console.log(`Paragraphs with all 3 models: ${complete.size}\n`);
+
+// ── 1. Outlier Analysis ──────────────────────────────────────────────────
+// Classify each fully-annotated paragraph as unanimous, 2v1 (recording which
+// model is the lone dissenter), or a three-way split — per dimension.
+
+console.log("=".repeat(70));
+console.log("1. OUTLIER ANALYSIS");
+console.log("=".repeat(70));
+
+// NOTE(review): Record's type arguments were stripped by the scrape;
+// restored as Record<ModelId, number> per usage.
+const catOutlierCount: Record<ModelId, number> = {
+  "google/gemini-3.1-flash-lite-preview": 0,
+  "x-ai/grok-4.1-fast": 0,
+  "xiaomi/mimo-v2-flash": 0,
+};
+// Fresh zeroed copy, tracked independently of the category counts.
+// (The original spread-then-reset loop was redundant: the source is all zeros.)
+const specOutlierCount: Record<ModelId, number> = { ...catOutlierCount };
+
+let catDisagree = 0;
+let specDisagree = 0;
+let catUnanimous = 0;
+let specUnanimous = 0;
+let threeWayDisagreeCat = 0;
+let threeWayDisagreeSpec = 0;
+
+for (const [, models] of complete) {
+  const cats = MODELS.map((m) => models.get(m)!.label.content_category);
+  const specs = MODELS.map((m) => models.get(m)!.label.specificity_level);
+
+  // Category: unanimous / 2v1 (attribute the outlier) / three-way split.
+  if (cats[0] === cats[1] && cats[1] === cats[2]) {
+    catUnanimous++;
+  } else if (cats[0] === cats[1] && cats[2] !== cats[0]) {
+    catDisagree++;
+    catOutlierCount[MODELS[2]]++;
+  } else if (cats[0] === cats[2] && cats[1] !== cats[0]) {
+    catDisagree++;
+    catOutlierCount[MODELS[1]]++;
+  } else if (cats[1] === cats[2] && cats[0] !== cats[1]) {
+    catDisagree++;
+    catOutlierCount[MODELS[0]]++;
+  } else {
+    threeWayDisagreeCat++;
+  }
+
+  // Specificity: same classification.
+  if (specs[0] === specs[1] && specs[1] === specs[2]) {
+    specUnanimous++;
+  } else if (specs[0] === specs[1] && specs[2] !== specs[0]) {
+    specDisagree++;
+    specOutlierCount[MODELS[2]]++;
+  } else if (specs[0] === specs[2] && specs[1] !== specs[0]) {
+    specDisagree++;
+    specOutlierCount[MODELS[1]]++;
+  } else if (specs[1] === specs[2] && specs[0] !== specs[1]) {
+    specDisagree++;
+    specOutlierCount[MODELS[0]]++;
+  } else {
+    threeWayDisagreeSpec++;
+  }
+}
+
+console.log(`\nCategory: ${catUnanimous} unanimous, ${catDisagree} 2v1, ${threeWayDisagreeCat} three-way disagree`);
+console.log("\nCategory outlier counts (when one model disagrees with the other two):");
+printTable(
+  ["Model", "Outlier Count", "% of 2v1"],
+  MODELS.map((m) => [SHORT[m], String(catOutlierCount[m]), pct(catOutlierCount[m], catDisagree)]),
+);
+
+console.log(`\nSpecificity: ${specUnanimous} unanimous, ${specDisagree} 2v1, ${threeWayDisagreeSpec} three-way disagree`);
+console.log("\nSpecificity outlier counts:");
+printTable(
+  ["Model", "Outlier Count", "% of 2v1"],
+  MODELS.map((m) => [SHORT[m], String(specOutlierCount[m]), pct(specOutlierCount[m], specDisagree)]),
+);
+
+// ── 2. Category Bias ─────────────────────────────────────────────────────
+// Per-model distribution over content categories, and each model's deviation
+// from the three-model average (its systematic over/under-use of a label).
+
+console.log("\n" + "=".repeat(70));
+console.log("2. CATEGORY BIAS");
+console.log("=".repeat(70));
+
+// NOTE(review): generic type arguments restored (stripped by the scrape).
+const allCategories = new Set<string>();
+const catCounts = {} as Record<ModelId, Record<string, number>>;
+for (const m of MODELS) catCounts[m] = {};
+
+for (const ann of annotations) {
+  const mid = ann.provenance.modelId as ModelId;
+  if (!MODELS.includes(mid)) continue;
+  const cat = ann.label.content_category;
+  allCategories.add(cat);
+  catCounts[mid][cat] = (catCounts[mid][cat] ?? 0) + 1;
+}
+
+const categories = [...allCategories].sort();
+// Per-model annotation totals; reused as denominators by section 3.
+const modelTotals = {} as Record<ModelId, number>;
+for (const m of MODELS) {
+  modelTotals[m] = Object.values(catCounts[m]).reduce((a, b) => a + b, 0);
+}
+
+console.log("\nCategory distribution (% of each model's annotations):\n");
+const catHeaders = ["Category", ...MODELS.map((m) => SHORT[m]), "Average"];
+const catRows: string[][] = [];
+for (const cat of categories) {
+  const pcts = MODELS.map((m) => (100 * (catCounts[m][cat] ?? 0)) / modelTotals[m]);
+  const avg = pcts.reduce((a, b) => a + b, 0) / 3;
+  catRows.push([cat, ...pcts.map((p) => p.toFixed(1) + "%"), avg.toFixed(1) + "%"]);
+}
+printTable(catHeaders, catRows);
+
+console.log("\nOver/under-indexing vs average (percentage points):\n");
+const biasHeaders = ["Category", ...MODELS.map((m) => SHORT[m])];
+const biasRows: string[][] = [];
+for (const cat of categories) {
+  const pcts = MODELS.map((m) => (100 * (catCounts[m][cat] ?? 0)) / modelTotals[m]);
+  const avg = pcts.reduce((a, b) => a + b, 0) / 3;
+  biasRows.push([
+    cat,
+    ...pcts.map((p) => {
+      const diff = p - avg;
+      const sign = diff >= 0 ? "+" : "";
+      return sign + diff.toFixed(1) + "pp";
+    }),
+  ]);
+}
+printTable(biasHeaders, biasRows);
+
+// ── 3. Specificity Bias ──────────────────────────────────────────────────
+// Per-model distribution over specificity levels 1-4, deviation from the
+// three-model average, and each model's mean level.
+
+console.log("\n" + "=".repeat(70));
+console.log("3. SPECIFICITY BIAS");
+console.log("=".repeat(70));
+
+// NOTE(review): generic type arguments restored (stripped by the scrape).
+const specCounts = {} as Record<ModelId, Record<number, number>>;
+for (const m of MODELS) specCounts[m] = {};
+
+for (const ann of annotations) {
+  const mid = ann.provenance.modelId as ModelId;
+  if (!MODELS.includes(mid)) continue;
+  const spec = ann.label.specificity_level;
+  specCounts[mid][spec] = (specCounts[mid][spec] ?? 0) + 1;
+}
+
+const specLevels = [1, 2, 3, 4];
+console.log("\nSpecificity distribution (% of each model's annotations):\n");
+const specHeaders = ["Spec Level", ...MODELS.map((m) => SHORT[m]), "Average"];
+const specRows: string[][] = [];
+for (const lvl of specLevels) {
+  const pcts = MODELS.map((m) => (100 * (specCounts[m][lvl] ?? 0)) / modelTotals[m]);
+  const avg = pcts.reduce((a, b) => a + b, 0) / 3;
+  specRows.push([
+    String(lvl),
+    ...pcts.map((p) => p.toFixed(1) + "%"),
+    avg.toFixed(1) + "%",
+  ]);
+}
+printTable(specHeaders, specRows);
+
+console.log("\nOver/under-indexing vs average (percentage points):\n");
+const specBiasRows: string[][] = [];
+for (const lvl of specLevels) {
+  const pcts = MODELS.map((m) => (100 * (specCounts[m][lvl] ?? 0)) / modelTotals[m]);
+  const avg = pcts.reduce((a, b) => a + b, 0) / 3;
+  specBiasRows.push([
+    String(lvl),
+    ...pcts.map((p) => {
+      const diff = p - avg;
+      const sign = diff >= 0 ? "+" : "";
+      return sign + diff.toFixed(1) + "pp";
+    }),
+  ]);
+}
+printTable(["Spec Level", ...MODELS.map((m) => SHORT[m])], specBiasRows);
+
+// Mean specificity per model (level weighted by its count).
+console.log("\nMean specificity per model:");
+for (const m of MODELS) {
+  let sum = 0;
+  let count = 0;
+  for (const [lvl, n] of Object.entries(specCounts[m])) {
+    sum += Number(lvl) * n;
+    count += n;
+  }
+  console.log(` ${SHORT[m]}: ${(sum / count).toFixed(3)}`);
+}
+
+// ── 4. Pairwise Agreement ────────────────────────────────────────────────
+// For each unordered model pair: how often they give the same category,
+// the same specificity, and both, over fully-annotated paragraphs.
+
+console.log("\n" + "=".repeat(70));
+console.log("4. PAIRWISE AGREEMENT");
+console.log("=".repeat(70));
+
+const pairs: [ModelId, ModelId][] = [
+  [MODELS[0], MODELS[1]],
+  [MODELS[0], MODELS[2]],
+  [MODELS[1], MODELS[2]],
+];
+
+console.log("");
+const pairHeaders = ["Pair", "Cat Agree", "Cat %", "Spec Agree", "Spec %", "Both Agree", "Both %"];
+const pairRows: string[][] = [];
+for (const [a, b] of pairs) {
+  const tally = { cat: 0, spec: 0, both: 0, n: 0 };
+  for (const models of complete.values()) {
+    const la = models.get(a)!.label;
+    const lb = models.get(b)!.label;
+    tally.n++;
+    const sameCat = la.content_category === lb.content_category;
+    const sameSpec = la.specificity_level === lb.specificity_level;
+    if (sameCat) tally.cat++;
+    if (sameSpec) tally.spec++;
+    if (sameCat && sameSpec) tally.both++;
+  }
+  pairRows.push([
+    `${SHORT[a]} - ${SHORT[b]}`,
+    String(tally.cat),
+    pct(tally.cat, tally.n),
+    String(tally.spec),
+    pct(tally.spec, tally.n),
+    String(tally.both),
+    pct(tally.both, tally.n),
+  ]);
+}
+printTable(pairHeaders, pairRows);
+
+// ── 5. Conditional Outlier ───────────────────────────────────────────────
+// When a model is the lone category dissenter, what does it say instead of
+// the majority label? Reveals directional confusions between categories.
+
+console.log("\n" + "=".repeat(70));
+console.log("5. CONDITIONAL OUTLIER: What does the outlier model say?");
+console.log("=".repeat(70));
+
+for (const outlierModel of MODELS) {
+  // wrongLabelDist[majorityLabel][outlierLabel] = count
+  // NOTE(review): Record's type arguments restored (stripped by the scrape).
+  const wrongLabelDist: Record<string, Record<string, number>> = {};
+
+  for (const [, models] of complete) {
+    const others = MODELS.filter((m) => m !== outlierModel);
+    const otherCats = others.map((m) => models.get(m)!.label.content_category);
+
+    if (otherCats[0] !== otherCats[1]) continue; // the other two must agree
+    const majority = otherCats[0];
+    const outlierCat = models.get(outlierModel)!.label.content_category;
+    if (outlierCat === majority) continue; // unanimous — no outlier here
+
+    if (!wrongLabelDist[majority]) wrongLabelDist[majority] = {};
+    wrongLabelDist[majority][outlierCat] = (wrongLabelDist[majority][outlierCat] ?? 0) + 1;
+  }
+
+  console.log(`\n${SHORT[outlierModel]} as outlier — what it says vs majority:`);
+  const majorityLabels = Object.keys(wrongLabelDist).sort();
+  if (majorityLabels.length === 0) {
+    console.log(" (no outlier cases)");
+    continue;
+  }
+  for (const maj of majorityLabels) {
+    const entries = Object.entries(wrongLabelDist[maj]).sort((a, b) => b[1] - a[1]);
+    const total = entries.reduce((s, [, n]) => s + n, 0);
+    console.log(` Majority="${maj}" (${total} cases):`);
+    for (const [label, count] of entries) {
+      console.log(` -> "${label}": ${count} (${pct(count, total)})`);
+    }
+  }
+}
+
+// ── 6. Spec 4 Analysis ──────────────────────────────────────────────────
+// When two models say Spec 4 and one disagrees: who dissents, and what
+// level does the dissenter give instead?
+
+console.log("\n" + "=".repeat(70));
+console.log("6. SPEC 4 ANALYSIS: Who disagrees when majority says Spec 4?");
+console.log("=".repeat(70));
+
+// NOTE(review): Record's type arguments restored (stripped by the scrape).
+const spec4Outliers = {} as Record<ModelId, Record<number, number>>;
+for (const m of MODELS) spec4Outliers[m] = {};
+let spec4DisagreeTotal = 0;
+
+for (const [, models] of complete) {
+  const specs = MODELS.map((m) => models.get(m)!.label.specificity_level);
+
+  // Keep only 2-vs-1 cases where the majority level is 4.
+  const countOf4 = specs.filter((s) => s === 4).length;
+  if (countOf4 < 2) continue; // majority is not 4
+  if (countOf4 === 3) continue; // unanimous
+
+  // Exactly one model disagrees; record the alternative level it gave.
+  for (let i = 0; i < 3; i++) {
+    if (specs[i] !== 4) {
+      spec4DisagreeTotal++;
+      spec4Outliers[MODELS[i]][specs[i]] = (spec4Outliers[MODELS[i]][specs[i]] ?? 0) + 1;
+    }
+  }
+}
+
+console.log(`\nTotal paragraphs where majority=Spec4 but one disagrees: ${spec4DisagreeTotal}\n`);
+for (const m of MODELS) {
+  const entries = Object.entries(spec4Outliers[m])
+    .map(([lvl, n]) => [Number(lvl), n] as [number, number])
+    .sort((a, b) => a[0] - b[0]);
+  const total = entries.reduce((s, [, n]) => s + n, 0);
+  if (total === 0) {
+    console.log(`${SHORT[m]}: never the outlier on Spec 4`);
+    continue;
+  }
+  console.log(`${SHORT[m]}: ${total} times the outlier (${pct(total, spec4DisagreeTotal)} of Spec4 disputes)`);
+  for (const [lvl, n] of entries) {
+    console.log(` -> says Spec ${lvl}: ${n} times`);
+  }
+}
+
+// ── 7. Management Role vs RMP Analysis ───────────────────────────────────
+// Paragraphs where the three votes split between "Management Role" and
+// "Risk Management Process" — which side does each model take?
+
+console.log("\n" + "=".repeat(70));
+console.log("7. MANAGEMENT ROLE vs RMP DISPUTES");
+console.log("=".repeat(70));
+
+// NOTE(review): Record's type arguments restored (stripped by the scrape).
+const mgmtRmpCounts = {} as Record<ModelId, { management: number; rmp: number }>;
+for (const m of MODELS) mgmtRmpCounts[m] = { management: 0, rmp: 0 };
+let mgmtRmpTotal = 0;
+
+for (const [, models] of complete) {
+  const cats = MODELS.map((m) => models.get(m)!.label.content_category);
+  const catSet = new Set(cats);
+
+  // Both labels must appear among the three votes.
+  const hasMgmt = catSet.has("Management Role");
+  const hasRmp = catSet.has("Risk Management Process");
+  if (!hasMgmt || !hasRmp) continue;
+
+  // Defensive: with both labels present this is always >= 2 out of 3 votes,
+  // but keep the guard in case the vote count ever changes.
+  const relevantCats = cats.filter(
+    (c) => c === "Management Role" || c === "Risk Management Process",
+  );
+  if (relevantCats.length < 2) continue;
+
+  mgmtRmpTotal++;
+  for (const m of MODELS) {
+    const cat = models.get(m)!.label.content_category;
+    if (cat === "Management Role") mgmtRmpCounts[m].management++;
+    if (cat === "Risk Management Process") mgmtRmpCounts[m].rmp++;
+  }
+}
+
+console.log(`\nParagraphs with Management Role vs RMP dispute: ${mgmtRmpTotal}\n`);
+printTable(
+  ["Model", "Says Management", "Says RMP", "Says Other"],
+  MODELS.map((m) => {
+    const other = mgmtRmpTotal - mgmtRmpCounts[m].management - mgmtRmpCounts[m].rmp;
+    return [
+      SHORT[m],
+      `${mgmtRmpCounts[m].management} (${pct(mgmtRmpCounts[m].management, mgmtRmpTotal)})`,
+      `${mgmtRmpCounts[m].rmp} (${pct(mgmtRmpCounts[m].rmp, mgmtRmpTotal)})`,
+      `${other} (${pct(other, mgmtRmpTotal)})`,
+    ];
+  }),
+);
+
+console.log("\nDone.");
diff --git a/ts/scripts/model-probe.ts b/ts/scripts/model-probe.ts
new file mode 100644
index 0000000..d8ac1ba
--- /dev/null
+++ b/ts/scripts/model-probe.ts
@@ -0,0 +1,79 @@
+/**
+ * Quick probe: test which OpenRouter models support structured output.
+ * Sends one paragraph to each candidate model, reports success/failure.
+ */
+import { generateText, Output } from "ai";
+import { openrouter } from "../src/lib/openrouter.ts";
+import { LabelOutput } from "../src/schemas/label.ts";
+import { SYSTEM_PROMPT, buildUserPrompt } from "../src/label/prompts.ts";
+import type { Paragraph } from "../src/schemas/paragraph.ts";
+
+// Synthetic Item 1C paragraph used as the one-shot probe input (all fields
+// are placeholder values for a fictional "Test Corp" filing).
+const TEST_PARAGRAPH: Paragraph = {
+  id: "00000000-0000-0000-0000-000000000001",
+  text: "The Board of Directors oversees the Company's management of cybersecurity risks. The Board has delegated oversight of cybersecurity and data privacy matters to the Audit Committee, which receives quarterly reports from the CISO.",
+  textHash: "test",
+  wordCount: 38,
+  paragraphIndex: 0,
+  filing: {
+    companyName: "Test Corp",
+    cik: "0000000001",
+    ticker: "TEST",
+    filingType: "10-K",
+    filingDate: "2024-03-15",
+    fiscalYear: 2023,
+    accessionNumber: "0000000001-24-000001",
+    secItem: "Item 1C",
+  },
+};
+
+// Candidate OpenRouter model IDs to probe for structured-output support.
+const CANDIDATES = [
+  // Cheap/fast tier - good for Stage 1
+  "google/gemini-3.1-flash-lite-preview",
+  "x-ai/grok-4.1-fast",
+  "openai/gpt-4.1-mini",
+  "openai/gpt-4.1-nano",
+  "anthropic/claude-haiku-4.5",
+  "google/gemini-3.1-flash-preview",
+  "deepseek/deepseek-chat-v3-0324:free",
+  "meta-llama/llama-4-maverick",
+  "qwen/qwen3-235b-a22b",
+];
+
+/**
+ * Send one structured-output request to `modelId` and log a one-line result:
+ * latency, reported cost, and the parsed label — or the (truncated) error.
+ * Never throws; failures are caught so the probe loop continues.
+ * NOTE(review): the return type's generic argument was stripped by the
+ * scrape; restored as Promise<void> (the function returns nothing).
+ */
+async function testModel(modelId: string): Promise<void> {
+  const start = Date.now();
+  try {
+    const result = await generateText({
+      model: openrouter(modelId),
+      output: Output.object({ schema: LabelOutput }),
+      system: SYSTEM_PROMPT,
+      prompt: buildUserPrompt(TEST_PARAGRAPH),
+      temperature: 0,
+      providerOptions: {
+        openrouter: {
+          reasoning: { effort: "low" },
+          usage: { include: true },
+        },
+      },
+    });
+
+    const latency = Date.now() - start;
+    const output = result.output;
+    // Cost is reported in the raw usage payload when usage.include is set.
+    const raw = result.usage as { raw?: { cost?: number } };
+    const cost = raw.raw?.cost ?? 0;
+
+    if (output) {
+      console.log(` ✓ ${modelId.padEnd(45)} ${latency}ms $${cost.toFixed(6)} → ${output.content_category}, spec=${output.specificity_level}`);
+    } else {
+      console.log(` ✖ ${modelId.padEnd(45)} ${latency}ms No output`);
+    }
+  } catch (error) {
+    const latency = Date.now() - start;
+    // Truncate provider error messages so one failure stays on one line.
+    const msg = error instanceof Error ? error.message.slice(0, 80) : String(error);
+    console.log(` ✖ ${modelId.padEnd(45)} ${latency}ms ${msg}`);
+  }
+}
+
+console.log("Testing structured output support across OpenRouter models...\n");
+// Probe models one at a time (sequential await) so output stays ordered and
+// we don't burst requests at every provider simultaneously.
+for (const model of CANDIDATES) {
+  await testModel(model);
+}
diff --git a/ts/scripts/pilot.ts b/ts/scripts/pilot.ts
new file mode 100644
index 0000000..8e78cf2
--- /dev/null
+++ b/ts/scripts/pilot.ts
@@ -0,0 +1,476 @@
+/**
+ * Prompt pilot: run all 3 Stage 1 models on a stratified sample of paragraphs.
+ *
+ * Usage:
+ * bun ts/scripts/pilot.ts [--n 40] [--seed 42] [--concurrency 5]
+ *
+ * Outputs:
+ * data/pilot/pilot-sample.jsonl — the sampled paragraphs
+ * data/pilot/pilot-results.jsonl — all annotations (3 per paragraph)
+ * data/pilot/pilot-report.txt — human-readable comparison report
+ */
+import { z } from "zod";
+import { readJsonl, writeJsonl, appendJsonl } from "../src/lib/jsonl.ts";
+import { Paragraph } from "../src/schemas/paragraph.ts";
+import { STAGE1_MODELS } from "../src/lib/openrouter.ts";
+import { annotateParagraph, type AnnotateOpts } from "../src/label/annotate.ts";
+import { PROMPT_VERSION } from "../src/label/prompts.ts";
+import { v4 as uuidv4 } from "uuid";
+import { writeFile, mkdir } from "node:fs/promises";
+import { existsSync } from "node:fs";
+import pLimit from "p-limit";
+
+// ── Args ────────────────────────────────────────────────────────────────
+const args = process.argv.slice(2);
+
+/** Return the value following `--name` on the command line, if present. */
+function flag(name: string): string | undefined {
+  const idx = args.indexOf(`--${name}`);
+  if (idx === -1) return undefined;
+  return args[idx + 1];
+}
+
+const N = parseInt(flag("n") ?? "40", 10);
+const SEED = parseInt(flag("seed") ?? "42", 10);
+const CONCURRENCY = parseInt(flag("concurrency") ?? "5", 10);
+
+// ── Seeded PRNG (mulberry32) ────────────────────────────────────────────
+// Returns a deterministic generator of floats in [0, 1) for a given seed,
+// so sampling is reproducible across runs.
+function mulberry32(seed: number) {
+  let state = seed | 0;
+  return () => {
+    state = (state + 0x6d2b79f5) | 0;
+    let t = Math.imul(state ^ (state >>> 15), state | 1);
+    t ^= t + Math.imul(t ^ (t >>> 7), t | 61);
+    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
+  };
+}
+
+// ── Stratified sampling ─────────────────────────────────────────────────
+/**
+ * Deterministically draw up to `n` paragraphs, stratified by word-count
+ * bucket × filing type, then topped up from the remaining pool.
+ * Same (paragraphs, n, seed) always yields the same sample.
+ */
+function sampleStratified(paragraphs: Paragraph[], n: number, seed: number): Paragraph[] {
+  const rng = mulberry32(seed);
+
+  // Bucket by word count quartile + filing type
+  // NOTE(review): Map's type arguments were stripped by the scrape;
+  // restored as Map<string, Paragraph[]> per usage.
+  const buckets = new Map<string, Paragraph[]>();
+  for (const p of paragraphs) {
+    const wcBucket =
+      p.wordCount < 50 ? "short" :
+      p.wordCount < 100 ? "medium" :
+      p.wordCount < 200 ? "long" : "very-long";
+    const key = `${wcBucket}|${p.filing.filingType}`;
+    const arr = buckets.get(key) ?? [];
+    arr.push(p);
+    buckets.set(key, arr);
+  }
+
+  // Draw proportionally from each bucket (keys sorted so RNG use is stable)
+  const sampled: Paragraph[] = [];
+  const bucketKeys = [...buckets.keys()].sort();
+  const perBucket = Math.max(1, Math.floor(n / bucketKeys.length));
+
+  for (const key of bucketKeys) {
+    const pool = buckets.get(key)!;
+    // Fisher-Yates shuffle with seeded RNG
+    for (let i = pool.length - 1; i > 0; i--) {
+      const j = Math.floor(rng() * (i + 1));
+      [pool[i], pool[j]] = [pool[j]!, pool[i]!];
+    }
+    sampled.push(...pool.slice(0, perBucket));
+  }
+
+  // Fill remaining from the full pool
+  if (sampled.length < n) {
+    const usedIds = new Set(sampled.map((p) => p.id));
+    const remaining = paragraphs.filter((p) => !usedIds.has(p.id));
+    for (let i = remaining.length - 1; i > 0; i--) {
+      const j = Math.floor(rng() * (i + 1));
+      [remaining[i], remaining[j]] = [remaining[j]!, remaining[i]!];
+    }
+    sampled.push(...remaining.slice(0, n - sampled.length));
+  }
+
+  return sampled.slice(0, n);
+}
+
+// ── Main ────────────────────────────────────────────────────────────────
+// NOTE(review): unlike the other scripts, these paths are cwd-relative (not
+// resolved via import.meta.url) — assumes the script is run from ts/; confirm.
+const PILOT_DIR = "../data/pilot";
+const TRAINING_PATH = "../data/paragraphs/training.jsonl";
+
+async function main() {
+ if (!existsSync(PILOT_DIR)) await mkdir(PILOT_DIR, { recursive: true });
+
+ // Load training data
+ console.error(`Loading training data from ${TRAINING_PATH}...`);
+ const { records: paragraphs, skipped } = await readJsonl(TRAINING_PATH, Paragraph);
+ if (skipped > 0) console.error(` ⚠ Skipped ${skipped} invalid lines`);
+ console.error(` Loaded ${paragraphs.length} paragraphs`);
+
+ // Sample
+ const sample = sampleStratified(paragraphs, N, SEED);
+ const samplePath = `${PILOT_DIR}/pilot-sample-${PROMPT_VERSION}.jsonl`;
+ await writeJsonl(samplePath, sample);
+ console.error(` Sampled ${sample.length} paragraphs (${samplePath})`);
+
+ // Show distribution
+ const filingTypes = new Map();
+ const wcBuckets = new Map();
+ for (const p of sample) {
+ filingTypes.set(p.filing.filingType, (filingTypes.get(p.filing.filingType) ?? 0) + 1);
+ const wc = p.wordCount < 50 ? "<50w" : p.wordCount < 100 ? "50-99w" : p.wordCount < 200 ? "100-199w" : "200+w";
+ wcBuckets.set(wc, (wcBuckets.get(wc) ?? 0) + 1);
+ }
+ console.error(` Filing types: ${[...filingTypes.entries()].map(([k, v]) => `${k}=${v}`).join(", ")}`);
+ console.error(` Word counts: ${[...wcBuckets.entries()].map(([k, v]) => `${k}=${v}`).join(", ")}`);
+
+ // Run all 3 models — with resume support
+ const runId = uuidv4();
+ const resultsPath = `${PILOT_DIR}/pilot-results-${PROMPT_VERSION}.jsonl`;
+ const limit = pLimit(CONCURRENCY);
+
+ type AnnotationResult = {
+ paragraphId: string;
+ modelId: string;
+ content_category: string;
+ specificity_level: number;
+ category_confidence: string;
+ specificity_confidence: string;
+ reasoning: string;
+ inputTokens: number;
+ outputTokens: number;
+ reasoningTokens: number;
+ costUsd: number;
+ latencyMs: number;
+ };
+
+ // Load existing results for resume
+ const doneKeys = new Set();
+ const allResults: AnnotationResult[] = [];
+ let totalCost = 0;
+
+ if (existsSync(resultsPath)) {
+ const { records: existing } = await readJsonl(resultsPath, z.object({
+ paragraphId: z.string(),
+ provenance: z.object({ modelId: z.string(), costUsd: z.number() }),
+ label: z.object({
+ content_category: z.string(),
+ specificity_level: z.number(),
+ category_confidence: z.string(),
+ specificity_confidence: z.string(),
+ reasoning: z.string(),
+ }),
+ }));
+ for (const rec of existing) {
+ doneKeys.add(`${rec.paragraphId}|${rec.provenance.modelId}`);
+ allResults.push({
+ paragraphId: rec.paragraphId,
+ modelId: rec.provenance.modelId,
+ content_category: rec.label.content_category,
+ specificity_level: rec.label.specificity_level,
+ category_confidence: rec.label.category_confidence,
+ specificity_confidence: rec.label.specificity_confidence,
+ reasoning: rec.label.reasoning,
+ inputTokens: 0, outputTokens: 0, reasoningTokens: 0,
+ costUsd: rec.provenance.costUsd,
+ latencyMs: 0,
+ });
+ totalCost += rec.provenance.costUsd;
+ }
+ if (doneKeys.size > 0) {
+ console.error(` Resuming: ${doneKeys.size} annotations already done, skipping`);
+ }
+ }
+
+ for (const modelId of STAGE1_MODELS) {
+ console.error(`\n ═══ ${modelId} ═══`);
+ const modelResults: AnnotationResult[] = [];
+ let modelCost = 0;
+ let modelInputTokens = 0;
+ let modelOutputTokens = 0;
+ let modelReasoningTokens = 0;
+
+ const tasks = sample.map((paragraph) =>
+ limit(async () => {
+ // Skip if already done (resume)
+ if (doneKeys.has(`${paragraph.id}|${modelId}`)) return;
+
+ const opts: AnnotateOpts = {
+ modelId,
+ stage: "stage1",
+ runId,
+ promptVersion: PROMPT_VERSION,
+ reasoningEffort: "low",
+ };
+
+ try {
+ const ann = await annotateParagraph(paragraph, opts);
+ const result: AnnotationResult = {
+ paragraphId: paragraph.id,
+ modelId,
+ content_category: ann.label.content_category,
+ specificity_level: ann.label.specificity_level,
+ category_confidence: ann.label.category_confidence,
+ specificity_confidence: ann.label.specificity_confidence,
+ reasoning: ann.label.reasoning,
+ inputTokens: ann.provenance.inputTokens,
+ outputTokens: ann.provenance.outputTokens,
+ reasoningTokens: ann.provenance.reasoningTokens,
+ costUsd: ann.provenance.costUsd,
+ latencyMs: ann.provenance.latencyMs,
+ };
+ modelResults.push(result);
+ allResults.push(result);
+ await appendJsonl(resultsPath, ann);
+
+ modelCost += ann.provenance.costUsd;
+ modelInputTokens += ann.provenance.inputTokens;
+ modelOutputTokens += ann.provenance.outputTokens;
+ modelReasoningTokens += ann.provenance.reasoningTokens;
+ totalCost += ann.provenance.costUsd;
+
+ const doneForModel = allResults.filter(r => r.modelId === modelId).length;
+ process.stderr.write(`\r ${doneForModel}/${sample.length} done $${modelCost.toFixed(4)}`);
+ } catch (error) {
+ console.error(`\n ✖ ${modelId} failed on ${paragraph.id}: ${error instanceof Error ? error.message : String(error)}`);
+ }
+ }),
+ );
+
+ await Promise.all(tasks);
+ console.error(
+ `\n ${modelId}: ${modelResults.length}/${sample.length} done` +
+ ` │ $${modelCost.toFixed(4)}` +
+ ` │ ${modelInputTokens.toLocaleString()} in / ${modelOutputTokens.toLocaleString()} out / ${modelReasoningTokens.toLocaleString()} reasoning`,
+ );
+ }
+
+ // ── Generate report ─────────────────────────────────────────────────
+ const report: string[] = [];
+ report.push(`SEC-cyBERT Prompt Pilot Report — ${new Date().toISOString()}`);
+ report.push(`Prompt version: ${PROMPT_VERSION}`);
+ report.push(`Sample: ${sample.length} paragraphs, seed=${SEED}`);
+ report.push(`Models: ${STAGE1_MODELS.join(", ")}`);
+ report.push(`Total cost: $${totalCost.toFixed(4)}`);
+ report.push("");
+
+ // Per-model stats
+ report.push("═══ PER-MODEL STATS ═══");
+ for (const modelId of STAGE1_MODELS) {
+ const modelAnns = allResults.filter((r) => r.modelId === modelId);
+ const cost = modelAnns.reduce((s, r) => s + r.costUsd, 0);
+ const inTok = modelAnns.reduce((s, r) => s + r.inputTokens, 0);
+ const outTok = modelAnns.reduce((s, r) => s + r.outputTokens, 0);
+ const reasonTok = modelAnns.reduce((s, r) => s + r.reasoningTokens, 0);
+ const avgLatency = modelAnns.length > 0
+ ? Math.round(modelAnns.reduce((s, r) => s + r.latencyMs, 0) / modelAnns.length)
+ : 0;
+
+ report.push(`\n${modelId}:`);
+ report.push(` Cost: $${cost.toFixed(4)} ($${(cost / modelAnns.length).toFixed(6)}/para)`);
+ report.push(` Tokens: ${inTok.toLocaleString()} in, ${outTok.toLocaleString()} out, ${reasonTok.toLocaleString()} reasoning`);
+ report.push(` Avg latency: ${avgLatency}ms`);
+
+ // Category distribution
+ const catCounts = new Map();
+ for (const r of modelAnns) {
+ catCounts.set(r.content_category, (catCounts.get(r.content_category) ?? 0) + 1);
+ }
+ report.push(` Categories: ${[...catCounts.entries()].sort((a, b) => b[1] - a[1]).map(([k, v]) => `${k}=${v}`).join(", ")}`);
+
+ // Specificity distribution
+ const specCounts = new Map();
+ for (const r of modelAnns) {
+ specCounts.set(r.specificity_level, (specCounts.get(r.specificity_level) ?? 0) + 1);
+ }
+ report.push(` Specificity: ${[...specCounts.entries()].sort((a, b) => a[0] - b[0]).map(([k, v]) => `${k}=${v}`).join(", ")}`);
+
+ // Confidence distribution
+ const catConf = new Map();
+ const specConf = new Map();
+ for (const r of modelAnns) {
+ catConf.set(r.category_confidence, (catConf.get(r.category_confidence) ?? 0) + 1);
+ specConf.set(r.specificity_confidence, (specConf.get(r.specificity_confidence) ?? 0) + 1);
+ }
+ report.push(` Category confidence: ${[...catConf.entries()].map(([k, v]) => `${k}=${v}`).join(", ")}`);
+ report.push(` Specificity confidence: ${[...specConf.entries()].map(([k, v]) => `${k}=${v}`).join(", ")}`);
+ }
+
+ // Agreement analysis
+ report.push("\n\n═══ AGREEMENT ANALYSIS ═══");
+ const byParagraph = new Map();
+ for (const r of allResults) {
+ const arr = byParagraph.get(r.paragraphId) ?? [];
+ arr.push(r);
+ byParagraph.set(r.paragraphId, arr);
+ }
+
+ let catAgree3 = 0, catAgree2 = 0, catDisagreeAll = 0;
+ let specAgree3 = 0, specAgree2 = 0, specDisagreeAll = 0;
+ let bothAgree3 = 0;
+
+ for (const [, anns] of byParagraph) {
+ if (anns.length !== 3) continue;
+
+ // Category agreement
+ const cats = anns.map((a) => a.content_category);
+ const uniqueCats = new Set(cats).size;
+ if (uniqueCats === 1) catAgree3++;
+ else if (uniqueCats === 2) catAgree2++;
+ else catDisagreeAll++;
+
+ // Specificity agreement
+ const specs = anns.map((a) => a.specificity_level);
+ const uniqueSpecs = new Set(specs).size;
+ if (uniqueSpecs === 1) specAgree3++;
+ else if (uniqueSpecs === 2) specAgree2++;
+ else specDisagreeAll++;
+
+ // Both agree
+ if (uniqueCats === 1 && uniqueSpecs === 1) bothAgree3++;
+ }
+
+ const total = byParagraph.size;
+ report.push(`Paragraphs with all 3 models: ${total}`);
+ report.push("");
+ report.push(`Content Category Agreement:`);
+ report.push(` 3/3 unanimous: ${catAgree3}/${total} (${((catAgree3/total)*100).toFixed(1)}%)`);
+ report.push(` 2/3 majority: ${catAgree2}/${total} (${((catAgree2/total)*100).toFixed(1)}%)`);
+ report.push(` All disagree: ${catDisagreeAll}/${total} (${((catDisagreeAll/total)*100).toFixed(1)}%)`);
+ report.push("");
+ report.push(`Specificity Level Agreement:`);
+ report.push(` 3/3 unanimous: ${specAgree3}/${total} (${((specAgree3/total)*100).toFixed(1)}%)`);
+ report.push(` 2/3 majority: ${specAgree2}/${total} (${((specAgree2/total)*100).toFixed(1)}%)`);
+ report.push(` All disagree: ${specDisagreeAll}/${total} (${((specDisagreeAll/total)*100).toFixed(1)}%)`);
+ report.push("");
+ report.push(`Both dimensions 3/3: ${bothAgree3}/${total} (${((bothAgree3/total)*100).toFixed(1)}%)`);
+ report.push(`Consensus (2/3+ on both): ${total - catDisagreeAll}/${total} (${(((total - catDisagreeAll)/total)*100).toFixed(1)}%)`);
+
+ // Specificity spread (mean absolute deviation across 3 models per paragraph)
+ const spreads: number[] = [];
+ for (const [, anns] of byParagraph) {
+ if (anns.length !== 3) continue;
+ const specs = anns.map((a) => a.specificity_level);
+ const mean = specs.reduce((s, v) => s + v, 0) / specs.length;
+ const mad = specs.reduce((s, v) => s + Math.abs(v - mean), 0) / specs.length;
+ spreads.push(mad);
+ }
+ const meanSpread = spreads.reduce((s, v) => s + v, 0) / spreads.length;
+ const maxSpread = Math.max(...spreads);
+ report.push(`\nSpecificity spread (MAD): mean=${meanSpread.toFixed(3)}, max=${maxSpread.toFixed(3)}`);
+ report.push(` Spread=0 (perfect): ${spreads.filter(s => s === 0).length}/${total} (${((spreads.filter(s => s === 0).length/total)*100).toFixed(1)}%)`);
+ report.push(` Spread≤0.33 (1 off): ${spreads.filter(s => s <= 0.34).length}/${total}`);
+ report.push(` Spread>0.67 (2+ off): ${spreads.filter(s => s > 0.67).length}/${total}`);
+
+ // Pairwise agreement (category + specificity)
+ report.push("\nPairwise agreement:");
+ for (let i = 0; i < STAGE1_MODELS.length; i++) {
+ for (let j = i + 1; j < STAGE1_MODELS.length; j++) {
+ let catAgree = 0, specAgree = 0, count = 0;
+ for (const [, anns] of byParagraph) {
+ const a = anns.find((r) => r.modelId === STAGE1_MODELS[i]);
+ const b = anns.find((r) => r.modelId === STAGE1_MODELS[j]);
+ if (a && b) {
+ count++;
+ if (a.content_category === b.content_category) catAgree++;
+ if (a.specificity_level === b.specificity_level) specAgree++;
+ }
+ }
+ const short = (m: string) => m.split("/")[1]!;
+ report.push(` ${short(STAGE1_MODELS[i]!)} × ${short(STAGE1_MODELS[j]!)}: cat=${((catAgree/count)*100).toFixed(1)}%, spec=${((specAgree/count)*100).toFixed(1)}%`);
+ }
+ }
+
+ // Category confusion matrix (which categories get mixed up)
+ report.push("\n\n═══ CATEGORY DISAGREEMENT PATTERNS ═══");
+ const catConfusion = new Map();
+ for (const [, anns] of byParagraph) {
+ if (anns.length !== 3) continue;
+ const cats = anns.map((a) => a.content_category).sort();
+ const unique = new Set(cats);
+ if (unique.size > 1) {
+ const key = [...unique].sort().join(" ↔ ");
+ catConfusion.set(key, (catConfusion.get(key) ?? 0) + 1);
+ }
+ }
+ for (const [pair, count] of [...catConfusion.entries()].sort((a, b) => b[1] - a[1])) {
+ report.push(` ${pair}: ${count}`);
+ }
+
+ // Specificity disagreement patterns
+ report.push("\n═══ SPECIFICITY DISAGREEMENT PATTERNS ═══");
+ const specConfusion = new Map();
+ for (const [, anns] of byParagraph) {
+ if (anns.length !== 3) continue;
+ const specs = anns.map((a) => a.specificity_level).sort();
+ const unique = new Set(specs);
+ if (unique.size > 1) {
+ const key = specs.join(",");
+ specConfusion.set(key, (specConfusion.get(key) ?? 0) + 1);
+ }
+ }
+ for (const [pattern, count] of [...specConfusion.entries()].sort((a, b) => b[1] - a[1])) {
+ report.push(` [${pattern}]: ${count}`);
+ }
+
+ // Per-category specificity agreement
+ report.push("\n═══ PER-CATEGORY AGREEMENT (where all 3 agree on category) ═══");
+ const catSpecAgreement = new Map();
+ for (const [, anns] of byParagraph) {
+ if (anns.length !== 3) continue;
+ const cats = anns.map((a) => a.content_category);
+ if (new Set(cats).size !== 1) continue;
+ const cat = cats[0]!;
+ const entry = catSpecAgreement.get(cat) ?? { total: 0, specAgree: 0 };
+ entry.total++;
+ if (new Set(anns.map((a) => a.specificity_level)).size === 1) entry.specAgree++;
+ catSpecAgreement.set(cat, entry);
+ }
+ for (const [cat, { total, specAgree }] of [...catSpecAgreement.entries()].sort((a, b) => b[1].total - a[1].total)) {
+ report.push(` ${cat.padEnd(28)} spec agree: ${specAgree}/${total} (${((specAgree/total)*100).toFixed(1)}%)`);
+ }
+
+ // Per-paragraph detail for disagreements
+ report.push("\n\n═══ DISAGREEMENT DETAILS ═══");
+ for (const [pid, anns] of byParagraph) {
+ if (anns.length !== 3) continue;
+ const cats = new Set(anns.map((a) => a.content_category));
+ const specs = new Set(anns.map((a) => a.specificity_level));
+ if (cats.size === 1 && specs.size === 1) continue; // skip agreements
+
+ const paragraph = sample.find((p) => p.id === pid);
+ const textPreview = paragraph ? paragraph.text.slice(0, 200) + (paragraph.text.length > 200 ? "..." : "") : "(not found)";
+
+ report.push(`\n--- ${pid} ---`);
+ report.push(`Company: ${paragraph?.filing.companyName ?? "?"}`);
+ report.push(`Text: ${textPreview}`);
+ for (const a of anns) {
+ const short = a.modelId.split("/")[1]!;
+ report.push(` ${short.padEnd(30)} → ${a.content_category.padEnd(25)} spec=${a.specificity_level} (cat:${a.category_confidence}, spec:${a.specificity_confidence})`);
+ report.push(` ${" ".repeat(30)} ${a.reasoning}`);
+ }
+ }
+
+ // Cost projections
+ report.push("\n\n═══ COST PROJECTIONS (50K paragraphs) ═══");
+ for (const modelId of STAGE1_MODELS) {
+ const modelAnns = allResults.filter((r) => r.modelId === modelId);
+ if (modelAnns.length === 0) continue;
+ const costPerPara = modelAnns.reduce((s, r) => s + r.costUsd, 0) / modelAnns.length;
+ const projected = costPerPara * 50000;
+ report.push(` ${modelId}: $${projected.toFixed(2)} ($${costPerPara.toFixed(6)}/para)`);
+ }
+ const totalCostPerPara = totalCost / (sample.length * STAGE1_MODELS.length);
+ const projectedTotal = totalCostPerPara * 50000 * 3;
+ report.push(` TOTAL Stage 1 (all 3 models): ~$${projectedTotal.toFixed(2)}`);
+
+ // Estimated judge cost (~17% disagreement rate from codebook)
+ const disagreeRate = (catAgree2 + catDisagreeAll) / total;
+ report.push(`\n Observed disagreement rate: ${(disagreeRate * 100).toFixed(1)}%`);
+ report.push(` Estimated Stage 2 judge calls: ~${Math.round(50000 * disagreeRate).toLocaleString()}`);
+ report.push(` (Judge cost depends on Sonnet 4.6 pricing — see OpenRouter)`);
+
+ const reportText = report.join("\n");
+ await writeFile(`${PILOT_DIR}/pilot-report-${PROMPT_VERSION}.txt`, reportText);
+
+ // Print to stdout
+ console.log(reportText);
+}
+
+// Entry point: surface any unhandled rejection and exit non-zero so a failed
+// pilot run is visible to the shell / CI instead of silently resolving.
+main().catch((err) => {
+  console.error(err);
+  process.exit(1);
+});
diff --git a/ts/scripts/sample-disputes.ts b/ts/scripts/sample-disputes.ts
new file mode 100644
index 0000000..4973c54
--- /dev/null
+++ b/ts/scripts/sample-disputes.ts
@@ -0,0 +1,229 @@
+/**
+ * Sample and print full paragraph text for the hardest dispute types.
+ *
+ * Prints 5 paragraphs from each of 4 dispute categories (20 total),
+ * with all 3 model annotations and company metadata.
+ *
+ * Usage: bun ts/scripts/sample-disputes.ts
+ */
+import { readJsonl, readJsonlRaw } from "../src/lib/jsonl.ts";
+import { Paragraph } from "../src/schemas/paragraph.ts";
+
+const PARAGRAPHS = new URL("../../data/paragraphs/paragraphs-clean.jsonl", import.meta.url).pathname;
+const ANNOTATIONS = new URL("../../data/annotations/stage1.jsonl", import.meta.url).pathname;
+
+// ── Types ──────────────────────────────────────────────────────────────
+/** One Stage 1 annotation row as read from stage1.jsonl (only the fields this script uses). */
+interface Ann {
+  paragraphId: string; // joins to Paragraph.id
+  label: {
+    content_category: string; // e.g. "Management Role", "Risk Management Process", "None/Other"
+    specificity_level: number; // ordinal specificity score (values 3/4 are compared below)
+    category_confidence: string; // free-form confidence string from the model
+    specificity_confidence: string;
+    reasoning: string; // model's free-text justification
+  };
+  provenance: {
+    modelId: string; // "vendor/model" id, shortened for display by shortModel()
+  };
+}
+
+// Inferred output type of the imported zod Paragraph schema (the alias
+// deliberately shadows the schema *value* of the same name).
+type Paragraph = (typeof Paragraph)["_output"];
+
+// ── Helpers ────────────────────────────────────────────────────────────
+/**
+ * Return the value occurring at least twice in `arr` (the 2-of-3 majority
+ * for three annotators), or null when no value reaches two votes.
+ *
+ * Fix: the generic parameter list was lost in transit — `T` was unresolved.
+ * Restored `<T>` on the function and the matching `Map<T, number>` counter.
+ * Note: a `T` that is literally `null` can never win due to the
+ * `best !== null` sentinel check.
+ */
+function majority<T>(arr: T[]): { value: T; count: number } | null {
+  const counts = new Map<T, number>();
+  for (const v of arr) counts.set(v, (counts.get(v) ?? 0) + 1);
+  let best: T | null = null;
+  let bestCount = 0;
+  for (const [v, c] of counts) {
+    if (c > bestCount) { best = v; bestCount = c; }
+  }
+  return best !== null && bestCount >= 2 ? { value: best, count: bestCount } : null;
+}
+
+/** Deterministic sample: sort by ID, then take every Nth to get `count` items from diverse companies. */
+/** Deterministic sample: sort by ID, then take every Nth to get `count` items from diverse companies. */
+function deterministicSample(
+  candidates: { id: string; companyName: string }[],
+  count: number,
+): typeof candidates {
+  if (candidates.length <= count) return candidates;
+
+  // No RNG anywhere: ordering by ID makes reruns print the identical sample.
+  const ordered = [...candidates].sort((x, y) => x.id.localeCompare(y.id));
+
+  // Stride through the ordered list, preferring companies not yet sampled.
+  // The stride is sized so ~3x `count` positions are visited, leaving room
+  // to skip duplicates while still reaching the target.
+  const stride = Math.max(1, Math.floor(ordered.length / (count * 3)));
+  const chosen: typeof candidates = [];
+  const usedCompanies = new Set<string>();
+
+  for (let i = 0; i < ordered.length && chosen.length < count; i += stride) {
+    const cand = ordered[i];
+    if (usedCompanies.has(cand.companyName)) continue;
+    chosen.push(cand);
+    usedCompanies.add(cand.companyName);
+  }
+
+  // Not enough distinct companies? Re-walk the same stride and allow repeats.
+  if (chosen.length < count) {
+    for (let i = 0; i < ordered.length && chosen.length < count; i += stride) {
+      const cand = ordered[i];
+      if (!chosen.includes(cand)) chosen.push(cand);
+    }
+  }
+
+  return chosen.slice(0, count);
+}
+
+// Compress a "vendor/model-name-preview" id into a ≤30-char display name.
+// "google/gemini-3.1-flash-lite-preview" → "gemini-3.1-flash-lite"
+function shortModel(modelId: string): string {
+  const slashAt = modelId.lastIndexOf("/");
+  const tail = slashAt === -1 ? modelId : modelId.slice(slashAt + 1);
+  const trimmed = tail.endsWith("-preview") ? tail.slice(0, -"-preview".length) : tail;
+  return trimmed.slice(0, 30);
+}
+
+/**
+ * Pretty-print one sampled paragraph: a header with company/filing metadata,
+ * the full paragraph text, then a boxed summary of each model's annotation
+ * (category, specificity, confidences, reasoning).
+ * `index` is the running sample number shown in the header.
+ */
+function printSample(
+  para: Paragraph,
+  anns: Ann[],
+  index: number,
+) {
+  console.log(`\n${"─".repeat(80)}`);
+  console.log(` [${index}] ${para.filing.companyName} (${para.filing.ticker}) — ${para.filing.filingType} ${para.filing.filingDate}`);
+  // paragraphIndex is zero-based in the data; display it one-based.
+  console.log(` Paragraph ${para.paragraphIndex + 1}, ${para.wordCount} words, ID: ${para.id}`);
+  console.log(`${"─".repeat(80)}`);
+  console.log();
+  console.log(para.text);
+  console.log();
+
+  // One box per model annotation (expected: three, one per Stage 1 model).
+  for (const a of anns) {
+    const model = shortModel(a.provenance.modelId);
+    console.log(` ┌─ ${model}`);
+    console.log(` │  Category:    ${a.label.content_category} (${a.label.category_confidence})`);
+    console.log(` │  Specificity: ${a.label.specificity_level} (${a.label.specificity_confidence})`);
+    console.log(` │  Reasoning:   ${a.label.reasoning}`);
+    console.log(` └─`);
+  }
+}
+
+// ── Main ───────────────────────────────────────────────────────────────
+/**
+ * Load the clean paragraph corpus and Stage 1 annotations, partition the
+ * fully-annotated paragraphs (all 3 models present) into four dispute pools,
+ * then deterministically sample and print 5 paragraphs from each pool.
+ */
+async function main() {
+  console.log("Loading paragraphs...");
+  const { records: paragraphs } = await readJsonl(PARAGRAPHS, Paragraph);
+  console.log(` ${paragraphs.length.toLocaleString()} paragraphs`);
+
+  const paraById = new Map(paragraphs.map(p => [p.id, p]));
+
+  console.log("Loading annotations...");
+  // Raw (unvalidated) read; the cast below is unchecked — rows are trusted to
+  // match the Ann shape declared above.
+  const { records: rawAnns, skipped } = await readJsonlRaw(ANNOTATIONS);
+  const anns = rawAnns as Ann[];
+  console.log(` ${anns.length.toLocaleString()} annotations (${skipped} skipped)`);
+
+  // Group annotations by paragraph
+  // NOTE(review): type arguments appear stripped in transit — presumably
+  // new Map<string, Ann[]>(); as written TS infers Map<any, any>.
+  const byParagraph = new Map();
+  for (const a of anns) {
+    let arr = byParagraph.get(a.paragraphId);
+    if (!arr) { arr = []; byParagraph.set(a.paragraphId, arr); }
+    arr.push(a);
+  }
+
+  // Count paragraphs per company
+  const companyParaCount = new Map();
+  for (const p of paragraphs) {
+    const name = p.filing.companyName;
+    companyParaCount.set(name, (companyParaCount.get(name) ?? 0) + 1);
+  }
+
+  // ── Build candidate pools ────────────────────────────────────────────
+
+  // Pool 1: Spec [3,4] disputes — specs are [3,3,4] or [3,4,4]
+  const specDisputes: { id: string; companyName: string }[] = [];
+
+  // Pool 2: Management ↔ RMP disputes — 2 say one, 1 says the other
+  const mgmtRmpDisputes: { id: string; companyName: string }[] = [];
+
+  // Pool 3: None/Other ↔ Strategy disputes
+  const noneStrategyDisputes: { id: string; companyName: string }[] = [];
+
+  // Pool 4: Small company (1-3 paras) + unresolved (3-way cat split)
+  const smallUnresolved: { id: string; companyName: string }[] = [];
+
+  // A paragraph may land in more than one pool; pools are independent filters.
+  for (const [pid, pannAnns] of byParagraph) {
+    if (pannAnns.length < 3) continue;
+    const para = paraById.get(pid);
+    if (!para) continue;
+
+    const cats = pannAnns.map(a => a.label.content_category);
+    const specs = pannAnns.map(a => a.label.specificity_level);
+    const sortedSpecs = [...specs].sort((a, b) => a - b);
+    // NOTE(review): sortedCats is never used below — candidate for removal.
+    const sortedCats = [...cats].sort();
+
+    const companyName = para.filing.companyName;
+
+    // Pool 1: Spec [3,3,4] or [3,4,4]
+    const specKey = sortedSpecs.join(",");
+    if (specKey === "3,3,4" || specKey === "3,4,4") {
+      specDisputes.push({ id: pid, companyName });
+    }
+
+    // Pool 2: Management ↔ RMP (2-1 split in either direction)
+    const mgmtCount = cats.filter(c => c === "Management Role").length;
+    const rmpCount = cats.filter(c => c === "Risk Management Process").length;
+    if ((mgmtCount === 2 && rmpCount === 1) || (mgmtCount === 1 && rmpCount === 2)) {
+      mgmtRmpDisputes.push({ id: pid, companyName });
+    }
+
+    // Pool 3: None/Other ↔ Strategy Integration (2-1 split in either direction)
+    const noneCount = cats.filter(c => c === "None/Other").length;
+    const stratCount = cats.filter(c => c === "Strategy Integration").length;
+    if ((noneCount === 2 && stratCount === 1) || (noneCount === 1 && stratCount === 2)) {
+      noneStrategyDisputes.push({ id: pid, companyName });
+    }
+
+    // Pool 4: Small company (1-3 paras) with 3-way cat split (unresolved)
+    const paraCount = companyParaCount.get(companyName) ?? 0;
+    const uniqueCats = new Set(cats);
+    if (paraCount <= 3 && uniqueCats.size === 3) {
+      smallUnresolved.push({ id: pid, companyName });
+    }
+  }
+
+  console.log(`\nCandidate pools:`);
+  console.log(` Spec [3,4] disputes: ${specDisputes.length}`);
+  console.log(` Management ↔ RMP disputes: ${mgmtRmpDisputes.length}`);
+  console.log(` None/Other ↔ Strategy disputes: ${noneStrategyDisputes.length}`);
+  console.log(` Small co. unresolved: ${smallUnresolved.length}`);
+
+  // ── Sample and print ────────────────────────────────────────────────
+
+  const sections: [string, { id: string; companyName: string }[]][] = [
+    ["SPEC [3,4] DISPUTES — Models can't decide firm-specific vs quantified-verifiable", specDisputes],
+    ["MANAGEMENT ↔ RMP DISPUTES — 2-vs-1 split between Management Role and Risk Management Process", mgmtRmpDisputes],
+    ["NONE/OTHER ↔ STRATEGY INTEGRATION DISPUTES — 2-vs-1 split between None/Other and Strategy Integration", noneStrategyDisputes],
+    ["SMALL COMPANY (1-3 PARAS) UNRESOLVED — 3-way category split, tiny filings", smallUnresolved],
+  ];
+
+  let globalIdx = 1;
+  for (const [title, pool] of sections) {
+    console.log(`\n${"═".repeat(80)}`);
+    console.log(` ${title}`);
+    console.log(`${"═".repeat(80)}`);
+
+    const sampled = deterministicSample(pool, 5);
+    if (sampled.length === 0) {
+      console.log("\n (no candidates found)");
+      continue;
+    }
+
+    for (const item of sampled) {
+      // Non-null assertions are safe: every pooled id came from these maps.
+      const para = paraById.get(item.id)!;
+      const pannAnns = byParagraph.get(item.id)!;
+      printSample(para, pannAnns, globalIdx++);
+    }
+  }
+
+  console.log(`\n${"═".repeat(80)}`);
+  console.log(` Done — ${globalIdx - 1} paragraphs printed.`);
+  console.log(`${"═".repeat(80)}`);
+}
+
+main().catch(console.error);
diff --git a/ts/scripts/segment-analysis.ts b/ts/scripts/segment-analysis.ts
new file mode 100644
index 0000000..1133d23
--- /dev/null
+++ b/ts/scripts/segment-analysis.ts
@@ -0,0 +1,432 @@
+/**
+ * Cross-tabulate agreement status against all paragraph metadata dimensions.
+ *
+ * Segments every paragraph into: unanimous | majority | unresolved
+ * Then breaks down by: fiscal year, filing type, sec item, category,
+ * specificity, confidence, company size (paragraph count proxy),
+ * word count quintile, and cross-dimensions.
+ *
+ * Usage: bun ts/scripts/segment-analysis.ts
+ */
+import { readJsonl, readJsonlRaw } from "../src/lib/jsonl.ts";
+import { Paragraph } from "../src/schemas/paragraph.ts";
+
+const PARAGRAPHS = new URL("../../data/paragraphs/paragraphs-clean.jsonl", import.meta.url).pathname;
+const ANNOTATIONS = new URL("../../data/annotations/stage1.jsonl", import.meta.url).pathname;
+
+// ── Types ──────────────────────────────────────────────────────────────
+/** One Stage 1 annotation row as read from stage1.jsonl (only the fields this script uses). */
+interface Ann {
+  paragraphId: string; // joins to Paragraph.id
+  label: {
+    content_category: string;
+    specificity_level: number; // ordinal; labeled 1-4 by specLabels in main()
+    category_confidence: string; // "high" / "medium" / "low" per confScore() in main()
+    specificity_confidence: string;
+    reasoning: string;
+  };
+  provenance: {
+    modelId: string; // "vendor/model" id of the annotating model
+    costUsd: number;
+    inputTokens: number;
+    outputTokens: number;
+    reasoningTokens: number;
+    latencyMs: number;
+    requestedAt: string; // ISO timestamp — presumably; not parsed here
+  };
+}
+
+// Agreement status across the 3 models on BOTH dimensions combined.
+type Segment = "unanimous" | "majority" | "unresolved";
+
+/** Per-paragraph agreement summary plus denormalized filing metadata for breakdowns. */
+interface ParagraphAnalysis {
+  id: string;
+  segment: Segment; // combined category+specificity status
+  catSegment: "cat-unanimous" | "cat-majority" | "cat-split"; // category dimension only
+  specSegment: "spec-unanimous" | "spec-majority" | "spec-split"; // specificity dimension only
+  majorityCat: string; // 2-of-3 winner, or first model's value when split
+  majoritySpec: number;
+  cats: string[]; // raw per-model values, in annotation order
+  specs: number[];
+  catConfidences: string[];
+  specConfidences: string[];
+  // Filing metadata
+  companyName: string;
+  ticker: string;
+  filingType: string;
+  filingDate: string;
+  fiscalYear: number;
+  secItem: string;
+  wordCount: number;
+}
+
+// ── Helpers ────────────────────────────────────────────────────────────
+/** Format n/total as a one-decimal percentage; an empty denominator renders as "0.0%". */
+function pct(n: number, total: number): string {
+  if (total === 0) return "0.0%";
+  const ratio = (n / total) * 100;
+  return `${ratio.toFixed(1)}%`;
+}
+
+/**
+ * Return the value occurring at least twice in `arr` (the 2-of-3 majority
+ * for three annotators), or null when no value reaches two votes.
+ *
+ * Fix: the generic parameter list was lost in transit — `T` was unresolved.
+ * Restored `<T>` on the function and the matching `Map<T, number>` counter.
+ */
+function majority<T>(arr: T[]): { value: T; count: number } | null {
+  const counts = new Map<T, number>();
+  for (const v of arr) counts.set(v, (counts.get(v) ?? 0) + 1);
+  let best: T | null = null;
+  let bestCount = 0;
+  for (const [v, c] of counts) {
+    if (c > bestCount) { best = v; bestCount = c; }
+  }
+  return best !== null && bestCount >= 2 ? { value: best, count: bestCount } : null;
+}
+
+function printDistribution(label: string, counts: Map) {
+ console.log(`\n${"═".repeat(70)}`);
+ console.log(` ${label}`);
+ console.log(`${"═".repeat(70)}`);
+
+ const sorted = [...counts.entries()].sort(([, a], [, b]) => b.total - a.total);
+ const maxKeyLen = Math.max(...sorted.map(([k]) => k.length), 20);
+
+ console.log(
+ ` ${"".padEnd(maxKeyLen)} ${"Total".padStart(7)} ${"Unan".padStart(7)} ${"Maj".padStart(7)} ${"Unres".padStart(7)} ${"Unan%".padStart(7)} ${"Unres%".padStart(7)}`
+ );
+ console.log(` ${"─".repeat(maxKeyLen + 50)}`);
+
+ for (const [key, v] of sorted) {
+ console.log(
+ ` ${key.padEnd(maxKeyLen)} ${String(v.total).padStart(7)} ${String(v.unanimous).padStart(7)} ${String(v.majority).padStart(7)} ${String(v.unresolved).padStart(7)} ${pct(v.unanimous, v.total).padStart(7)} ${pct(v.unresolved, v.total).padStart(7)}`
+ );
+ }
+}
+
+function printCrossTab(label: string, rows: Map>, colOrder?: string[]) {
+ console.log(`\n${"═".repeat(70)}`);
+ console.log(` ${label}`);
+ console.log(`${"═".repeat(70)}`);
+
+ const allCols = colOrder ?? [...new Set([...rows.values()].flatMap(m => [...m.keys()]))].sort();
+ const maxKeyLen = Math.max(...[...rows.keys()].map(k => k.length), 15);
+ const colWidth = 8;
+
+ console.log(
+ ` ${"".padEnd(maxKeyLen)} ${allCols.map(c => c.slice(0, colWidth).padStart(colWidth)).join(" ")}`
+ );
+ console.log(` ${"─".repeat(maxKeyLen + (colWidth + 2) * allCols.length)}`);
+
+ const sortedRows = [...rows.entries()].sort(([a], [b]) => a.localeCompare(b));
+ for (const [key, cols] of sortedRows) {
+ const total = [...cols.values()].reduce((a, b) => a + b, 0);
+ const cells = allCols.map(c => {
+ const n = cols.get(c) ?? 0;
+ return `${pct(n, total)}`.padStart(colWidth);
+ });
+ console.log(` ${key.padEnd(maxKeyLen)} ${cells.join(" ")} (n=${total})`);
+ }
+}
+
+// ── Main ───────────────────────────────────────────────────────────────
+/**
+ * Load the corpus and Stage 1 annotations, classify every fully-annotated
+ * paragraph (all 3 models present) as unanimous / majority / unresolved,
+ * then print ~18 breakdowns across filing metadata, confidence patterns,
+ * company buckets, and dispute-pattern dimensions.
+ */
+async function main() {
+  console.log("Loading paragraphs...");
+  const { records: paragraphs } = await readJsonl(PARAGRAPHS, Paragraph);
+  console.log(` ${paragraphs.length.toLocaleString()} paragraphs`);
+
+  const paraById = new Map(paragraphs.map(p => [p.id, p]));
+
+  console.log("Loading annotations...");
+  // Raw read; the cast is unchecked — rows are trusted to match Ann above.
+  const { records: rawAnns, skipped } = await readJsonlRaw(ANNOTATIONS);
+  const anns = rawAnns as Ann[];
+  console.log(` ${anns.length.toLocaleString()} annotations (${skipped} skipped)`);
+
+  // Group annotations by paragraph
+  // NOTE(review): type arguments on the Maps below appear stripped in transit
+  // (presumably Map<string, Ann[]> and Map<string, number>); TS infers any.
+  const byParagraph = new Map();
+  for (const a of anns) {
+    let arr = byParagraph.get(a.paragraphId);
+    if (!arr) { arr = []; byParagraph.set(a.paragraphId, arr); }
+    arr.push(a);
+  }
+
+  // Count paragraphs per company (for company-size bucketing)
+  const companyParaCount = new Map();
+  for (const p of paragraphs) {
+    const name = p.filing.companyName;
+    companyParaCount.set(name, (companyParaCount.get(name) ?? 0) + 1);
+  }
+
+  // ── Analyze each paragraph ──────────────────────────────────────────
+  const analyzed: ParagraphAnalysis[] = [];
+
+  for (const [pid, pannAnns] of byParagraph) {
+    // Only paragraphs scored by all 3 models enter the analysis.
+    if (pannAnns.length < 3) continue;
+    const para = paraById.get(pid);
+    if (!para) continue;
+
+    const cats = pannAnns.map(a => a.label.content_category);
+    const specs = pannAnns.map(a => a.label.specificity_level);
+    const catConfidences = pannAnns.map(a => a.label.category_confidence);
+    const specConfidences = pannAnns.map(a => a.label.specificity_confidence);
+
+    const catMaj = majority(cats);
+    const specMaj = majority(specs);
+
+    // Per-dimension status: all equal → unanimous; 2-of-3 → majority; else split.
+    const catSeg = cats.every(c => c === cats[0]) ? "cat-unanimous"
+      : catMaj ? "cat-majority" : "cat-split";
+    const specSeg = specs.every(s => s === specs[0]) ? "spec-unanimous"
+      : specMaj ? "spec-majority" : "spec-split";
+
+    // Combined status: unanimous requires both dimensions unanimous;
+    // a split on EITHER dimension makes the paragraph unresolved.
+    let segment: Segment;
+    if (catSeg === "cat-unanimous" && specSeg === "spec-unanimous") {
+      segment = "unanimous";
+    } else if (catMaj && specMaj) {
+      segment = "majority";
+    } else {
+      segment = "unresolved";
+    }
+
+    analyzed.push({
+      id: pid,
+      segment,
+      catSegment: catSeg,
+      specSegment: specSeg,
+      // Fallback to the first model's value when there is no 2-of-3 winner.
+      majorityCat: catMaj?.value ?? cats[0],
+      majoritySpec: specMaj?.value ?? specs[0],
+      cats,
+      specs,
+      catConfidences,
+      specConfidences,
+      companyName: para.filing.companyName,
+      ticker: para.filing.ticker,
+      filingType: para.filing.filingType,
+      filingDate: para.filing.filingDate,
+      fiscalYear: para.filing.fiscalYear,
+      secItem: para.filing.secItem,
+      wordCount: para.wordCount,
+    });
+  }
+
+  console.log(`\n${analyzed.length.toLocaleString()} paragraphs analyzed\n`);
+
+  // ── Overview ─────────────────────────────────────────────────────────
+  const segCounts = { unanimous: 0, majority: 0, unresolved: 0 };
+  for (const a of analyzed) segCounts[a.segment]++;
+  console.log("SEGMENT OVERVIEW:");
+  console.log(` Unanimous: ${segCounts.unanimous.toLocaleString()} (${pct(segCounts.unanimous, analyzed.length)})`);
+  console.log(` Majority: ${segCounts.majority.toLocaleString()} (${pct(segCounts.majority, analyzed.length)})`);
+  console.log(` Unresolved: ${segCounts.unresolved.toLocaleString()} (${pct(segCounts.unresolved, analyzed.length)})`);
+
+  // Cat vs spec disagreement breakdown
+  const catSpecBreakdown = { catOnly: 0, specOnly: 0, both: 0 };
+  for (const a of analyzed) {
+    if (a.segment === "unanimous") continue;
+    const catDis = a.catSegment !== "cat-unanimous";
+    const specDis = a.specSegment !== "spec-unanimous";
+    if (catDis && specDis) catSpecBreakdown.both++;
+    else if (catDis) catSpecBreakdown.catOnly++;
+    else catSpecBreakdown.specOnly++;
+  }
+  const disputed = segCounts.majority + segCounts.unresolved;
+  console.log(`\n Disagreement breakdown (of ${disputed.toLocaleString()} non-unanimous):`);
+  console.log(` Category only: ${catSpecBreakdown.catOnly.toLocaleString()} (${pct(catSpecBreakdown.catOnly, disputed)})`);
+  console.log(` Specificity only: ${catSpecBreakdown.specOnly.toLocaleString()} (${pct(catSpecBreakdown.specOnly, disputed)})`);
+  console.log(` Both: ${catSpecBreakdown.both.toLocaleString()} (${pct(catSpecBreakdown.both, disputed)})`);
+
+  // ── Distribution functions ──────────────────────────────────────────
+  // Bucket all analyzed paragraphs by keyFn and tally segment counts per key.
+  function buildDist(keyFn: (a: ParagraphAnalysis) => string) {
+    const dist = new Map();
+    for (const a of analyzed) {
+      const key = keyFn(a);
+      let entry = dist.get(key);
+      if (!entry) { entry = { total: 0, unanimous: 0, majority: 0, unresolved: 0 }; dist.set(key, entry); }
+      entry.total++;
+      entry[a.segment]++; // segment name doubles as the field name
+    }
+    return dist;
+  }
+
+  // ── 1. By fiscal year ───────────────────────────────────────────────
+  printDistribution("BY FISCAL YEAR", buildDist(a => String(a.fiscalYear)));
+
+  // ── 2. By filing type ───────────────────────────────────────────────
+  printDistribution("BY FILING TYPE", buildDist(a => a.filingType));
+
+  // ── 3. By SEC item ──────────────────────────────────────────────────
+  printDistribution("BY SEC ITEM", buildDist(a => a.secItem));
+
+  // ── 4. By majority category ─────────────────────────────────────────
+  printDistribution("BY MAJORITY CATEGORY", buildDist(a => a.majorityCat));
+
+  // ── 5. By majority specificity ──────────────────────────────────────
+  // NOTE(review): type args look stripped here — presumably
+  // Record<number, string>; `Record` without arguments does not compile.
+  const specLabels: Record = {
+    1: "1-Generic", 2: "2-Sector", 3: "3-Firm", 4: "4-Quantified"
+  };
+  printDistribution("BY MAJORITY SPECIFICITY", buildDist(a => specLabels[a.majoritySpec] ?? String(a.majoritySpec)));
+
+  // ── 6. By confidence pattern ────────────────────────────────────────
+  // NOTE(review): .sort() here mutates the stored confidence arrays in place;
+  // harmless for the order-insensitive uses below (§17, §18), but fragile.
+  printDistribution("BY CATEGORY CONFIDENCE PATTERN",
+    buildDist(a => a.catConfidences.sort().join("/")));
+  printDistribution("BY SPECIFICITY CONFIDENCE PATTERN",
+    buildDist(a => a.specConfidences.sort().join("/")));
+
+  // ── 7. By word count quintile ───────────────────────────────────────
+  const wordCounts = analyzed.map(a => a.wordCount).sort((a, b) => a - b);
+  const q20 = wordCounts[Math.floor(wordCounts.length * 0.2)];
+  const q40 = wordCounts[Math.floor(wordCounts.length * 0.4)];
+  const q60 = wordCounts[Math.floor(wordCounts.length * 0.6)];
+  const q80 = wordCounts[Math.floor(wordCounts.length * 0.8)];
+  console.log(`\n Word count quintile boundaries: ${q20}, ${q40}, ${q60}, ${q80}`);
+  printDistribution("BY WORD COUNT QUINTILE", buildDist(a => {
+    if (a.wordCount <= q20) return `Q1 (≤${q20})`;
+    if (a.wordCount <= q40) return `Q2 (${q20+1}-${q40})`;
+    if (a.wordCount <= q60) return `Q3 (${q40+1}-${q60})`;
+    if (a.wordCount <= q80) return `Q4 (${q60+1}-${q80})`;
+    return `Q5 (>${q80})`;
+  }));
+
+  // ── 8. By company size bucket ───────────────────────────────────────
+  printDistribution("BY COMPANY SIZE (paragraph count)", buildDist(a => {
+    const n = companyParaCount.get(a.companyName) ?? 0;
+    if (n <= 3) return "1-3 paras";
+    if (n <= 6) return "4-6 paras";
+    if (n <= 10) return "7-10 paras";
+    if (n <= 20) return "11-20 paras";
+    return "21+ paras";
+  }));
+
+  // ── 9. Cross-tab: category × segment ────────────────────────────────
+  const catBySegment = new Map>();
+  for (const a of analyzed) {
+    const key = a.majorityCat;
+    let row = catBySegment.get(key);
+    if (!row) { row = new Map(); catBySegment.set(key, row); }
+    row.set(a.segment, (row.get(a.segment) ?? 0) + 1);
+  }
+  printCrossTab("CATEGORY × SEGMENT", catBySegment, ["unanimous", "majority", "unresolved"]);
+
+  // ── 10. Cross-tab: specificity × segment ────────────────────────────
+  const specBySegment = new Map>();
+  for (const a of analyzed) {
+    const key = specLabels[a.majoritySpec] ?? String(a.majoritySpec);
+    let row = specBySegment.get(key);
+    if (!row) { row = new Map(); specBySegment.set(key, row); }
+    row.set(a.segment, (row.get(a.segment) ?? 0) + 1);
+  }
+  printCrossTab("SPECIFICITY × SEGMENT", specBySegment, ["unanimous", "majority", "unresolved"]);
+
+  // ── 11. Cross-tab: fiscal year × category (for non-unanimous) ──────
+  const yearByCat = new Map>();
+  for (const a of analyzed) {
+    if (a.segment === "unanimous") continue;
+    const key = String(a.fiscalYear);
+    let row = yearByCat.get(key);
+    if (!row) { row = new Map(); yearByCat.set(key, row); }
+    row.set(a.majorityCat, (row.get(a.majorityCat) ?? 0) + 1);
+  }
+  printCrossTab("FISCAL YEAR × CATEGORY (non-unanimous only)", yearByCat);
+
+  // ── 12. Top disagreement companies ──────────────────────────────────
+  const companyDisagree = new Map();
+  for (const a of analyzed) {
+    let entry = companyDisagree.get(a.companyName);
+    if (!entry) { entry = { total: 0, disputed: 0 }; companyDisagree.set(a.companyName, entry); }
+    entry.total++;
+    if (a.segment !== "unanimous") entry.disputed++;
+  }
+
+  console.log(`\n${"═".repeat(70)}`);
+  console.log(" TOP 30 COMPANIES BY DISAGREEMENT RATE (min 5 paragraphs)");
+  console.log(`${"═".repeat(70)}`);
+  const companyRanked = [...companyDisagree.entries()]
+    .filter(([, v]) => v.total >= 5)
+    .map(([name, v]) => ({ name, ...v, rate: v.disputed / v.total }))
+    .sort((a, b) => b.rate - a.rate)
+    .slice(0, 30);
+
+  for (const c of companyRanked) {
+    console.log(` ${c.name.slice(0, 45).padEnd(45)} ${c.disputed}/${c.total} disputed (${pct(c.disputed, c.total)})`);
+  }
+
+  // ── 13. Bottom 30 companies (lowest disagreement) ──────────────────
+  console.log(`\n${"═".repeat(70)}`);
+  console.log(" TOP 30 COMPANIES BY AGREEMENT RATE (min 5 paragraphs)");
+  console.log(`${"═".repeat(70)}`);
+  const companyAgreed = [...companyDisagree.entries()]
+    .filter(([, v]) => v.total >= 5)
+    .map(([name, v]) => ({ name, ...v, rate: v.disputed / v.total }))
+    .sort((a, b) => a.rate - b.rate)
+    .slice(0, 30);
+
+  for (const c of companyAgreed) {
+    console.log(` ${c.name.slice(0, 45).padEnd(45)} ${c.disputed}/${c.total} disputed (${pct(c.disputed, c.total)})`);
+  }
+
+  // ── 14. Specificity spread analysis ─────────────────────────────────
+  console.log(`\n${"═".repeat(70)}`);
+  console.log(" SPECIFICITY SPREAD (max - min) FOR NON-UNANIMOUS");
+  console.log(`${"═".repeat(70)}`);
+  const specSpread = new Map();
+  for (const a of analyzed) {
+    if (a.specSegment === "spec-unanimous") continue;
+    const spread = Math.max(...a.specs) - Math.min(...a.specs);
+    const key = `spread-${spread}`;
+    specSpread.set(key, (specSpread.get(key) ?? 0) + 1);
+  }
+  for (const [key, count] of [...specSpread.entries()].sort()) {
+    console.log(` ${key}: ${count.toLocaleString()}`);
+  }
+
+  // ── 15. Most common category dispute pairs ──────────────────────────
+  console.log(`\n${"═".repeat(70)}`);
+  console.log(" TOP CATEGORY DISPUTE PAIRS");
+  console.log(`${"═".repeat(70)}`);
+  const catPairs = new Map();
+  for (const a of analyzed) {
+    if (a.catSegment === "cat-unanimous") continue;
+    const sorted = [...new Set(a.cats)].sort();
+    const key = sorted.join(" ↔ ");
+    catPairs.set(key, (catPairs.get(key) ?? 0) + 1);
+  }
+  for (const [pair, count] of [...catPairs.entries()].sort(([, a], [, b]) => b - a).slice(0, 15)) {
+    console.log(` ${pair.padEnd(55)} ${count.toLocaleString()}`);
+  }
+
+  // ── 16. Spec dispute patterns ───────────────────────────────────────
+  console.log(`\n${"═".repeat(70)}`);
+  console.log(" TOP SPECIFICITY DISPUTE PATTERNS");
+  console.log(`${"═".repeat(70)}`);
+  const specPatterns = new Map();
+  for (const a of analyzed) {
+    if (a.specSegment === "spec-unanimous") continue;
+    const sorted = [...a.specs].sort((a, b) => a - b);
+    const key = `[${sorted.join(",")}]`;
+    specPatterns.set(key, (specPatterns.get(key) ?? 0) + 1);
+  }
+  for (const [pattern, count] of [...specPatterns.entries()].sort(([, a], [, b]) => b - a).slice(0, 15)) {
+    console.log(` ${pattern.padEnd(20)} ${count.toLocaleString()}`);
+  }
+
+  // ── 17. Confidence vs agreement rate ────────────────────────────────
+  console.log(`\n${"═".repeat(70)}`);
+  console.log(" AVERAGE CONFIDENCE BY SEGMENT");
+  console.log(`${"═".repeat(70)}`);
+  // Map confidence strings to 3/2/1; anything unrecognized scores 1.
+  const confScore = (c: string) => c === "high" ? 3 : c === "medium" ? 2 : 1;
+  for (const seg of ["unanimous", "majority", "unresolved"] as const) {
+    const group = analyzed.filter(a => a.segment === seg);
+    // Inner /3 averages the three models; outer /group.length averages paragraphs.
+    const avgCatConf = group.reduce((s, a) => s + a.catConfidences.reduce((s2, c) => s2 + confScore(c), 0) / 3, 0) / group.length;
+    const avgSpecConf = group.reduce((s, a) => s + a.specConfidences.reduce((s2, c) => s2 + confScore(c), 0) / 3, 0) / group.length;
+    console.log(` ${seg.padEnd(12)} avg cat conf: ${avgCatConf.toFixed(2)} avg spec conf: ${avgSpecConf.toFixed(2)}`);
+  }
+
+  // ── 18. All-low-confidence counts ───────────────────────────────────
+  console.log(`\n${"═".repeat(70)}`);
+  console.log(" ALL-LOW-CONFIDENCE PATTERNS");
+  console.log(`${"═".repeat(70)}`);
+  const allLowCat = analyzed.filter(a => a.catConfidences.every(c => c === "low"));
+  const allLowSpec = analyzed.filter(a => a.specConfidences.every(c => c === "low"));
+  const allLowBoth = analyzed.filter(a => a.catConfidences.every(c => c === "low") && a.specConfidences.every(c => c === "low"));
+  console.log(` All-low cat confidence: ${allLowCat.length} (${pct(allLowCat.length, analyzed.length)})`);
+  console.log(` All-low spec confidence: ${allLowSpec.length} (${pct(allLowSpec.length, analyzed.length)})`);
+  console.log(` All-low both: ${allLowBoth.length} (${pct(allLowBoth.length, analyzed.length)})`);
+
+  // Of those, segment distribution
+  for (const [label, group] of [["All-low cat", allLowCat], ["All-low spec", allLowSpec]] as const) {
+    const segDist = { unanimous: 0, majority: 0, unresolved: 0 };
+    for (const a of group) segDist[a.segment]++;
+    console.log(` ${label} → unanimous: ${segDist.unanimous}, majority: ${segDist.majority}, unresolved: ${segDist.unresolved}`);
+  }
+}
+
+main().catch(console.error);
diff --git a/ts/scripts/stage1-analyze.ts b/ts/scripts/stage1-analyze.ts
new file mode 100644
index 0000000..8ec1a8b
--- /dev/null
+++ b/ts/scripts/stage1-analyze.ts
@@ -0,0 +1,538 @@
+/**
+ * Deep analysis of Stage 1 annotation data.
+ *
+ * Usage: bun ts/scripts/stage1-analyze.ts
+ */
+import { readJsonlRaw } from "../src/lib/jsonl.ts";
+
+const INPUT = new URL("../../data/annotations/stage1.jsonl", import.meta.url).pathname;
+
+// ── Types ──────────────────────────────────────────────────────────────
+// One model's annotation of one paragraph, as persisted in stage1.jsonl.
+interface Ann {
+  paragraphId: string; // join key back to the paragraphs corpus
+  // The label the model produced for this paragraph.
+  label: {
+    content_category: string; // free-form category name (see codebook)
+    specificity_level: number; // 1–4 ordinal scale (1 = boilerplate … 4 = quantitative)
+    category_confidence: string; // "high" | "medium" | "low" — presumably; confirm against schema
+    specificity_confidence: string; // same confidence vocabulary as above
+    reasoning: string; // model's short free-text justification
+  };
+  // Where/how the annotation was obtained, for cost and latency accounting.
+  provenance: {
+    modelId: string; // provider-qualified id, e.g. "vendor/model-name"
+    costUsd: number;
+    inputTokens: number;
+    outputTokens: number;
+    reasoningTokens: number;
+    latencyMs: number;
+    requestedAt: string; // timestamp string; format not visible here — TODO confirm ISO-8601
+  };
+}
+
+// Bare `Map` is a compile error in TS (Map<K, V> requires both type args);
+// the inline comment documents the intended shape: paragraphId → annotations.
+type ModelAnns = Map<string, Ann[]>; // paragraphId → annotations
+
+// ── Helpers ────────────────────────────────────────────────────────────
+/** Format n as a percentage of total with one decimal place, e.g. "33.3%". */
+function pct(n: number, total: number): string {
+  const share = (n / total) * 100;
+  return `${share.toFixed(1)}%`;
+}
+
+/** Median of an unsorted numeric array (does not mutate its input). */
+function median(arr: number[]): number {
+  const s = [...arr].sort((x, y) => x - y);
+  const half = Math.floor(s.length / 2);
+  if (s.length % 2 === 1) return s[half];
+  return (s[half - 1] + s[half]) / 2;
+}
+
+function mean(arr: number[]): number {
+ return arr.reduce((a, b) => a + b, 0) / arr.length;
+}
+
+function stddev(arr: number[]): number {
+ const m = mean(arr);
+ return Math.sqrt(arr.reduce((sum, x) => sum + (x - m) ** 2, 0) / arr.length);
+}
+
+/** Linear-interpolation percentile (p in [0, 100]) of an unsorted array. */
+function percentile(arr: number[], p: number): number {
+  const s = [...arr].sort((x, y) => x - y);
+  const pos = (p / 100) * (s.length - 1);
+  const below = Math.floor(pos);
+  const above = Math.ceil(pos);
+  if (below === above) return s[below];
+  return s[below] + (s[above] - s[below]) * (pos - below);
+}
+
+// ── Main ───────────────────────────────────────────────────────────────
+/**
+ * Entry point: loads every Stage 1 annotation from INPUT (stage1.jsonl) and
+ * prints a multi-section report to stdout — overview/cost totals, per-model
+ * stats, inter-model agreement, disagreement breakdowns, label distributions,
+ * category × specificity cross-tabs, confidence-vs-agreement, outlier-model
+ * counts, majority-vote consensus, and a rough Stage 2 workload/cost estimate.
+ * Console-only: nothing is written back to disk.
+ */
+async function main() {
+  console.log("Loading annotations...");
+  const { records: raw, skipped } = await readJsonlRaw(INPUT);
+  const anns = raw as Ann[];
+  console.log(`  ${anns.length.toLocaleString()} annotations loaded, ${skipped} skipped\n`);
+
+  // Group by paragraph (paragraphId → its annotations, one per model)
+  const byParagraph = new Map();
+  for (const a of anns) {
+    let arr = byParagraph.get(a.paragraphId);
+    if (!arr) { arr = []; byParagraph.set(a.paragraphId, arr); }
+    arr.push(a);
+  }
+
+  // Group by model (modelId → all annotations that model produced)
+  const byModel = new Map();
+  for (const a of anns) {
+    let arr = byModel.get(a.provenance.modelId);
+    if (!arr) { arr = []; byModel.set(a.provenance.modelId, arr); }
+    arr.push(a);
+  }
+
+  const modelNames = [...byModel.keys()].sort();
+  const shortName = (m: string) => m.split("/").pop()!;
+  const nParagraphs = byParagraph.size;
+
+  // ════════════════════════════════════════════════════════════════════
+  // 1. OVERVIEW
+  // ════════════════════════════════════════════════════════════════════
+  console.log("═══════════════════════════════════════════════════════════");
+  console.log("  STAGE 1 DEEP ANALYSIS");
+  console.log("═══════════════════════════════════════════════════════════\n");
+
+  console.log("── Overview ──────────────────────────────────────────────");
+  console.log(`  Paragraphs:  ${nParagraphs.toLocaleString()}`);
+  console.log(`  Annotations: ${anns.length.toLocaleString()}`);
+  console.log(`  Models:      ${modelNames.map(shortName).join(", ")}`);
+
+  // Aggregate cost/token totals across every annotation.
+  let totalCost = 0, totalInput = 0, totalOutput = 0, totalReasoning = 0;
+  for (const a of anns) {
+    totalCost += a.provenance.costUsd;
+    totalInput += a.provenance.inputTokens;
+    totalOutput += a.provenance.outputTokens;
+    totalReasoning += a.provenance.reasoningTokens;
+  }
+  console.log(`  Total cost:  $${totalCost.toFixed(2)}`);
+  console.log(`  Input tokens:  ${(totalInput / 1e6).toFixed(1)}M`);
+  console.log(`  Output tokens: ${(totalOutput / 1e6).toFixed(1)}M`);
+  console.log(`  Reasoning:     ${(totalReasoning / 1e6).toFixed(1)}M`);
+
+  // ════════════════════════════════════════════════════════════════════
+  // 2. PER-MODEL STATS
+  // ════════════════════════════════════════════════════════════════════
+  console.log("\n── Per-Model Statistics ───────────────────────────────────");
+  for (const model of modelNames) {
+    const mas = byModel.get(model)!;
+    const costs = mas.map(a => a.provenance.costUsd);
+    const latencies = mas.map(a => a.provenance.latencyMs);
+    const outputs = mas.map(a => a.provenance.outputTokens);
+    console.log(`\n  ${shortName(model)} (n=${mas.length.toLocaleString()}):`);
+    console.log(`    Cost: $${costs.reduce((a, b) => a + b, 0).toFixed(2)} total, $${mean(costs).toFixed(5)}/ann`);
+    console.log(`    Latency: median ${median(latencies).toFixed(0)}ms, p95 ${percentile(latencies, 95).toFixed(0)}ms, p99 ${percentile(latencies, 99).toFixed(0)}ms`);
+    console.log(`    Output: median ${median(outputs).toFixed(0)} tokens, mean ${mean(outputs).toFixed(0)}`);
+
+    // Category distribution
+    const catCounts = new Map();
+    const specCounts = new Map();
+    const confCatCounts = new Map();
+    const confSpecCounts = new Map();
+    for (const a of mas) {
+      catCounts.set(a.label.content_category, (catCounts.get(a.label.content_category) ?? 0) + 1);
+      specCounts.set(a.label.specificity_level, (specCounts.get(a.label.specificity_level) ?? 0) + 1);
+      confCatCounts.set(a.label.category_confidence, (confCatCounts.get(a.label.category_confidence) ?? 0) + 1);
+      confSpecCounts.set(a.label.specificity_confidence, (confSpecCounts.get(a.label.specificity_confidence) ?? 0) + 1);
+    }
+    console.log(`    Categories: ${[...catCounts.entries()].sort((a, b) => b[1] - a[1]).map(([k, v]) => `${k}=${v}`).join(", ")}`);
+    console.log(`    Specificity: ${[...specCounts.entries()].sort((a, b) => a[0] - b[0]).map(([k, v]) => `${k}=${v} (${pct(v, mas.length)})`).join(", ")}`);
+    console.log(`    Cat confidence: ${[...confCatCounts.entries()].sort((a, b) => b[1] - a[1]).map(([k, v]) => `${k}=${v}`).join(", ")}`);
+    console.log(`    Spec confidence: ${[...confSpecCounts.entries()].sort((a, b) => b[1] - a[1]).map(([k, v]) => `${k}=${v}`).join(", ")}`);
+  }
+
+  // ════════════════════════════════════════════════════════════════════
+  // 3. AGREEMENT ANALYSIS
+  // ════════════════════════════════════════════════════════════════════
+  console.log("\n\n── Agreement Analysis ─────────────────────────────────────");
+
+  let catUnanimous = 0, specUnanimous = 0, bothUnanimous = 0;
+  let catMajority = 0, specMajority = 0, bothMajority = 0;
+  let catNoMajority = 0, specNoMajority = 0;
+  const specSpreads: number[] = [];
+
+  // Category confusion tracking
+  const catDisagreementPairs = new Map();
+  const specDisagreementPatterns = new Map();
+
+  // Only paragraphs with exactly 3 annotations (one per model) are scored;
+  // incomplete paragraphs are skipped here but still counted in nParagraphs.
+  for (const [pid, panns] of byParagraph) {
+    if (panns.length !== 3) continue;
+
+    const cats = panns.map(a => a.label.content_category);
+    const specs = panns.map(a => a.label.specificity_level);
+
+    // Category agreement — with 3 votes, any majority value must sit at
+    // index 0 or 1, so checking those two suffices.
+    const catSet = new Set(cats);
+    const catUnan = catSet.size === 1;
+    const catMaj = cats.filter(c => c === cats[0]).length >= 2 ||
+                   cats.filter(c => c === cats[1]).length >= 2;
+    if (catUnan) catUnanimous++;
+    if (catMaj) catMajority++;
+    if (!catMaj) {
+      catNoMajority++;
+    }
+
+    // Track category disagreement pairs
+    if (!catUnan) {
+      const sorted = [...cats].sort();
+      for (let i = 0; i < sorted.length; i++) {
+        for (let j = i + 1; j < sorted.length; j++) {
+          if (sorted[i] !== sorted[j]) {
+            const key = `${sorted[i]} ↔ ${sorted[j]}`;
+            catDisagreementPairs.set(key, (catDisagreementPairs.get(key) ?? 0) + 1);
+          }
+        }
+      }
+    }
+
+    // Specificity agreement
+    const specSet = new Set(specs);
+    const specUnan = specSet.size === 1;
+    const specMaj0 = specs.filter(s => s === specs[0]).length >= 2 ||
+                     specs.filter(s => s === specs[1]).length >= 2;
+    if (specUnan) specUnanimous++;
+    if (specMaj0) specMajority++;
+    if (!specMaj0) specNoMajority++;
+
+    // Specificity spread (MAD)
+    const specMedian = median(specs);
+    const mad = mean(specs.map(s => Math.abs(s - specMedian)));
+    specSpreads.push(mad);
+
+    // Track specificity disagreement patterns
+    if (!specUnan) {
+      const sortedSpecs = [...specs].sort((a, b) => a - b);
+      const key = `[${sortedSpecs.join(",")}]`;
+      specDisagreementPatterns.set(key, (specDisagreementPatterns.get(key) ?? 0) + 1);
+    }
+
+    // Both
+    if (catUnan && specUnan) bothUnanimous++;
+    if (catMaj && specMaj0) bothMajority++;
+  }
+
+  // NOTE(review): percentages below divide by nParagraphs, which includes any
+  // paragraphs skipped above for having != 3 annotations — confirm none exist.
+  console.log(`\n  Unanimity (all 3 agree):`);
+  console.log(`    Category:    ${catUnanimous.toLocaleString()} / ${nParagraphs.toLocaleString()} (${pct(catUnanimous, nParagraphs)})`);
+  console.log(`    Specificity: ${specUnanimous.toLocaleString()} / ${nParagraphs.toLocaleString()} (${pct(specUnanimous, nParagraphs)})`);
+  console.log(`    Both:        ${bothUnanimous.toLocaleString()} / ${nParagraphs.toLocaleString()} (${pct(bothUnanimous, nParagraphs)})`);
+
+  console.log(`\n  Majority (≥2 agree):`);
+  console.log(`    Category:    ${catMajority.toLocaleString()} / ${nParagraphs.toLocaleString()} (${pct(catMajority, nParagraphs)})`);
+  console.log(`    Specificity: ${specMajority.toLocaleString()} / ${nParagraphs.toLocaleString()} (${pct(specMajority, nParagraphs)})`);
+  console.log(`    Both:        ${bothMajority.toLocaleString()} / ${nParagraphs.toLocaleString()} (${pct(bothMajority, nParagraphs)})`);
+
+  console.log(`\n  No majority (3-way split):`);
+  console.log(`    Category:    ${catNoMajority.toLocaleString()} (${pct(catNoMajority, nParagraphs)})`);
+  console.log(`    Specificity: ${specNoMajority.toLocaleString()} (${pct(specNoMajority, nParagraphs)})`);
+
+  console.log(`\n  Specificity spread (MAD):`);
+  console.log(`    Mean: ${mean(specSpreads).toFixed(3)}`);
+  console.log(`    Median: ${median(specSpreads).toFixed(3)}`);
+  console.log(`    Std: ${stddev(specSpreads).toFixed(3)}`);
+
+  // Stage 2 need: anything short of full (category AND specificity) unanimity.
+  const needsStage2 = nParagraphs - bothUnanimous;
+  console.log(`\n  → Need Stage 2 judge: ${needsStage2.toLocaleString()} (${pct(needsStage2, nParagraphs)})`);
+
+  // ════════════════════════════════════════════════════════════════════
+  // 4. DISAGREEMENT BREAKDOWN
+  // ════════════════════════════════════════════════════════════════════
+  console.log("\n\n── Category Disagreement Pairs (top 20) ──────────────────");
+  const sortedCatDis = [...catDisagreementPairs.entries()].sort((a, b) => b[1] - a[1]);
+  for (const [pair, count] of sortedCatDis.slice(0, 20)) {
+    console.log(`  ${count.toLocaleString().padStart(6)}  ${pair}`);
+  }
+
+  console.log("\n── Specificity Disagreement Patterns (all) ────────────────");
+  const sortedSpecDis = [...specDisagreementPatterns.entries()].sort((a, b) => b[1] - a[1]);
+  for (const [pattern, count] of sortedSpecDis) {
+    console.log(`  ${count.toLocaleString().padStart(6)}  ${pattern}`);
+  }
+
+  // ════════════════════════════════════════════════════════════════════
+  // 5. PAIRWISE MODEL AGREEMENT
+  // ════════════════════════════════════════════════════════════════════
+  console.log("\n\n── Pairwise Model Agreement ───────────────────────────────");
+  for (let i = 0; i < modelNames.length; i++) {
+    for (let j = i + 1; j < modelNames.length; j++) {
+      const m1 = modelNames[i], m2 = modelNames[j];
+      let catAgree = 0, specAgree = 0, bothAgree = 0, total = 0;
+      for (const [pid, panns] of byParagraph) {
+        const a1 = panns.find(a => a.provenance.modelId === m1);
+        const a2 = panns.find(a => a.provenance.modelId === m2);
+        if (!a1 || !a2) continue;
+        total++;
+        const ca = a1.label.content_category === a2.label.content_category;
+        const sa = a1.label.specificity_level === a2.label.specificity_level;
+        if (ca) catAgree++;
+        if (sa) specAgree++;
+        if (ca && sa) bothAgree++;
+      }
+      console.log(`\n  ${shortName(m1)} × ${shortName(m2)} (n=${total.toLocaleString()}):`);
+      console.log(`    Category:    ${pct(catAgree, total)} (${catAgree.toLocaleString()})`);
+      console.log(`    Specificity: ${pct(specAgree, total)} (${specAgree.toLocaleString()})`);
+      console.log(`    Both:        ${pct(bothAgree, total)} (${bothAgree.toLocaleString()})`);
+    }
+  }
+
+  // ════════════════════════════════════════════════════════════════════
+  // 6. CATEGORY DISTRIBUTION (AGGREGATE)
+  // ════════════════════════════════════════════════════════════════════
+  console.log("\n\n── Category Distribution (all annotations) ────────────────");
+  const aggCat = new Map();
+  for (const a of anns) {
+    aggCat.set(a.label.content_category, (aggCat.get(a.label.content_category) ?? 0) + 1);
+  }
+  const sortedCats = [...aggCat.entries()].sort((a, b) => b[1] - a[1]);
+  for (const [cat, count] of sortedCats) {
+    console.log(`  ${count.toLocaleString().padStart(8)}  ${pct(count, anns.length).padStart(6)}  ${cat}`);
+  }
+
+  // Per-model category distribution comparison
+  console.log("\n── Category Distribution by Model (%) ─────────────────────");
+  const categories = sortedCats.map(([c]) => c);
+  const header = "Category".padEnd(30) + modelNames.map(m => shortName(m).padStart(12)).join("");
+  console.log(`  ${header}`);
+  for (const cat of categories) {
+    const row = cat.padEnd(30) + modelNames.map(m => {
+      const mas = byModel.get(m)!;
+      const count = mas.filter(a => a.label.content_category === cat).length;
+      return pct(count, mas.length).padStart(12);
+    }).join("");
+    console.log(`  ${row}`);
+  }
+
+  // ════════════════════════════════════════════════════════════════════
+  // 7. SPECIFICITY DISTRIBUTION (AGGREGATE)
+  // ════════════════════════════════════════════════════════════════════
+  console.log("\n── Specificity Distribution (all annotations) ──────────────");
+  const specLabels = ["Generic Boilerplate", "Sector-Adapted", "Firm-Specific", "Quantitatively Verifiable"];
+  const aggSpec = new Map();
+  for (const a of anns) {
+    aggSpec.set(a.label.specificity_level, (aggSpec.get(a.label.specificity_level) ?? 0) + 1);
+  }
+  for (let s = 1; s <= 4; s++) {
+    const count = aggSpec.get(s) ?? 0;
+    console.log(`  ${count.toLocaleString().padStart(8)}  ${pct(count, anns.length).padStart(6)}  ${s} (${specLabels[s - 1]})`);
+  }
+
+  console.log("\n── Specificity Distribution by Model (%) ──────────────────");
+  const specHeader = "Level".padEnd(30) + modelNames.map(m => shortName(m).padStart(12)).join("");
+  console.log(`  ${specHeader}`);
+  for (let s = 1; s <= 4; s++) {
+    const row = `${s} ${specLabels[s - 1]}`.padEnd(30) + modelNames.map(m => {
+      const mas = byModel.get(m)!;
+      const count = mas.filter(a => a.label.specificity_level === s).length;
+      return pct(count, mas.length).padStart(12);
+    }).join("");
+    console.log(`  ${row}`);
+  }
+
+  // ════════════════════════════════════════════════════════════════════
+  // 8. CROSS-TABULATION: Category × Specificity
+  // ════════════════════════════════════════════════════════════════════
+  console.log("\n── Category × Specificity Cross-tab (unanimous paragraphs only) ─");
+  const crossTab = new Map();
+  let unanimousCount = 0;
+  for (const [pid, panns] of byParagraph) {
+    if (panns.length !== 3) continue;
+    const cats = panns.map(a => a.label.content_category);
+    const specs = panns.map(a => a.label.specificity_level);
+    if (new Set(cats).size === 1 && new Set(specs).size === 1) {
+      const key = `${cats[0]}|${specs[0]}`;
+      crossTab.set(key, (crossTab.get(key) ?? 0) + 1);
+      unanimousCount++;
+    }
+  }
+  console.log(`  (${unanimousCount.toLocaleString()} paragraphs with both-unanimous)\n`);
+  const ctHeader = "Category".padEnd(30) + [1, 2, 3, 4].map(s => `${s}`.padStart(8)).join("") + " Total".padStart(8);
+  console.log(`  ${ctHeader}`);
+  for (const cat of categories) {
+    let rowTotal = 0;
+    const cells = [1, 2, 3, 4].map(s => {
+      const v = crossTab.get(`${cat}|${s}`) ?? 0;
+      rowTotal += v;
+      return `${v}`.padStart(8);
+    }).join("");
+    console.log(`  ${cat.padEnd(30)}${cells}  ${`${rowTotal}`.padStart(6)}`);
+  }
+
+  // ════════════════════════════════════════════════════════════════════
+  // 9. CONFIDENCE ANALYSIS
+  // ════════════════════════════════════════════════════════════════════
+  console.log("\n\n── Confidence vs Agreement ─────────────────────────────────");
+  // Check if low-confidence predictions are more likely to disagree.
+  // Buckets are not mutually exclusive: a paragraph is counted in every
+  // bucket for which at least one of its 3 annotations matches the filter.
+  const confBuckets: { label: string; filter: (a: Ann) => boolean }[] = [
+    { label: "both high", filter: a => a.label.category_confidence === "high" && a.label.specificity_confidence === "high" },
+    { label: "cat low", filter: a => a.label.category_confidence === "low" },
+    { label: "spec low", filter: a => a.label.specificity_confidence === "low" },
+    { label: "cat medium", filter: a => a.label.category_confidence === "medium" },
+    { label: "spec medium", filter: a => a.label.specificity_confidence === "medium" },
+  ];
+
+  for (const bucket of confBuckets) {
+    // Find paragraphs where at least one model reported this confidence
+    let totalP = 0, catUnanP = 0, specUnanP = 0, bothUnanP = 0;
+    for (const [pid, panns] of byParagraph) {
+      if (panns.length !== 3) continue;
+      if (!panns.some(bucket.filter)) continue;
+      totalP++;
+      const cats = panns.map(a => a.label.content_category);
+      const specs = panns.map(a => a.label.specificity_level);
+      if (new Set(cats).size === 1) catUnanP++;
+      if (new Set(specs).size === 1) specUnanP++;
+      if (new Set(cats).size === 1 && new Set(specs).size === 1) bothUnanP++;
+    }
+    if (totalP === 0) continue;
+    console.log(`  "${bucket.label}" paragraphs (n=${totalP.toLocaleString()}): cat ${pct(catUnanP, totalP)}, spec ${pct(specUnanP, totalP)}, both ${pct(bothUnanP, totalP)}`);
+  }
+
+  // ════════════════════════════════════════════════════════════════════
+  // 10. OUTLIER MODEL ANALYSIS
+  // ════════════════════════════════════════════════════════════════════
+  console.log("\n\n── Outlier Analysis (model is the odd one out) ────────────");
+  const outlierCounts = new Map();
+  for (const m of modelNames) {
+    outlierCounts.set(m, { catOutlier: 0, specOutlier: 0, total: 0 });
+  }
+
+  for (const [pid, panns] of byParagraph) {
+    if (panns.length !== 3) continue;
+    for (const m of modelNames) {
+      outlierCounts.get(m)!.total++;
+    }
+
+    const cats = panns.map(a => a.label.content_category);
+    const specs = panns.map(a => a.label.specificity_level);
+
+    // Category: if 2 agree and 1 differs, the differing one is outlier.
+    // (panns.filter(o => o !== a) relies on object identity to drop `a`.)
+    if (new Set(cats).size === 2) {
+      for (const a of panns) {
+        const others = panns.filter(o => o !== a);
+        if (others[0].label.content_category === others[1].label.content_category &&
+            a.label.content_category !== others[0].label.content_category) {
+          outlierCounts.get(a.provenance.modelId)!.catOutlier++;
+        }
+      }
+    }
+
+    // Specificity: if 2 agree and 1 differs
+    if (new Set(specs).size === 2) {
+      for (const a of panns) {
+        const others = panns.filter(o => o !== a);
+        if (others[0].label.specificity_level === others[1].label.specificity_level &&
+            a.label.specificity_level !== others[0].label.specificity_level) {
+          outlierCounts.get(a.provenance.modelId)!.specOutlier++;
+        }
+      }
+    }
+  }
+
+  for (const m of modelNames) {
+    const o = outlierCounts.get(m)!;
+    console.log(`\n  ${shortName(m)}:`);
+    console.log(`    Category outlier:    ${o.catOutlier.toLocaleString()} times (${pct(o.catOutlier, o.total)})`);
+    console.log(`    Specificity outlier: ${o.specOutlier.toLocaleString()} times (${pct(o.specOutlier, o.total)})`);
+  }
+
+  // ════════════════════════════════════════════════════════════════════
+  // 11. CATEGORY-SPECIFIC SPECIFICITY AGREEMENT
+  // ════════════════════════════════════════════════════════════════════
+  console.log("\n\n── Specificity Agreement by Category ──────────────────────");
+  console.log("  (among paragraphs where all 3 models agree on category)\n");
+  const catSpecAgreement = new Map();
+  for (const [pid, panns] of byParagraph) {
+    if (panns.length !== 3) continue;
+    const cats = panns.map(a => a.label.content_category);
+    if (new Set(cats).size !== 1) continue;
+    const cat = cats[0];
+    if (!catSpecAgreement.has(cat)) catSpecAgreement.set(cat, { total: 0, specUnan: 0, specMaj: 0 });
+    const entry = catSpecAgreement.get(cat)!;
+    entry.total++;
+    const specs = panns.map(a => a.label.specificity_level);
+    if (new Set(specs).size === 1) entry.specUnan++;
+    if (specs.filter(s => s === specs[0]).length >= 2 || specs.filter(s => s === specs[1]).length >= 2) entry.specMaj++;
+  }
+
+  for (const cat of categories) {
+    const e = catSpecAgreement.get(cat);
+    if (!e) continue;
+    console.log(`  ${cat.padEnd(28)} n=${e.total.toLocaleString().padStart(6)}  spec-unan: ${pct(e.specUnan, e.total).padStart(6)}  spec-maj: ${pct(e.specMaj, e.total).padStart(6)}`);
+  }
+
+  // ════════════════════════════════════════════════════════════════════
+  // 12. CONSENSUS LABELS (majority vote or unanimous)
+  // ════════════════════════════════════════════════════════════════════
+  console.log("\n\n── Consensus Label Distribution (majority vote) ───────────");
+  const consensusCat = new Map();
+  const consensusSpec = new Map();
+  let noConsensusCat = 0, noConsensusSpec = 0;
+
+  for (const [pid, panns] of byParagraph) {
+    if (panns.length !== 3) continue;
+
+    // Category majority
+    const cats = panns.map(a => a.label.content_category);
+    const catFreq = new Map();
+    for (const c of cats) catFreq.set(c, (catFreq.get(c) ?? 0) + 1);
+    const majCat = [...catFreq.entries()].find(([, v]) => v >= 2)?.[0];
+    if (majCat) {
+      consensusCat.set(majCat, (consensusCat.get(majCat) ?? 0) + 1);
+    } else {
+      noConsensusCat++;
+    }
+
+    // Specificity majority (!== undefined because level 0 would be falsy)
+    const specs = panns.map(a => a.label.specificity_level);
+    const specFreq = new Map();
+    for (const s of specs) specFreq.set(s, (specFreq.get(s) ?? 0) + 1);
+    const majSpec = [...specFreq.entries()].find(([, v]) => v >= 2)?.[0];
+    if (majSpec !== undefined) {
+      consensusSpec.set(majSpec, (consensusSpec.get(majSpec) ?? 0) + 1);
+    } else {
+      noConsensusSpec++;
+    }
+  }
+
+  console.log("\n  Category (majority vote):");
+  const sortedConsCat = [...consensusCat.entries()].sort((a, b) => b[1] - a[1]);
+  for (const [cat, count] of sortedConsCat) {
+    console.log(`  ${count.toLocaleString().padStart(8)}  ${pct(count, nParagraphs).padStart(6)}  ${cat}`);
+  }
+  console.log(`  ${noConsensusCat.toLocaleString().padStart(8)}  ${pct(noConsensusCat, nParagraphs).padStart(6)}  [no majority]`);
+
+  console.log("\n  Specificity (majority vote):");
+  for (let s = 1; s <= 4; s++) {
+    const count = consensusSpec.get(s) ?? 0;
+    console.log(`  ${count.toLocaleString().padStart(8)}  ${pct(count, nParagraphs).padStart(6)}  ${s} (${specLabels[s - 1]})`);
+  }
+  console.log(`  ${noConsensusSpec.toLocaleString().padStart(8)}  ${pct(noConsensusSpec, nParagraphs).padStart(6)}  [no majority]`);
+
+  // ════════════════════════════════════════════════════════════════════
+  // 13. STAGE 2 WORKLOAD ESTIMATE
+  // ════════════════════════════════════════════════════════════════════
+  console.log("\n\n── Stage 2 Workload Estimate ───────────────────────────────");
+  let catOnly = 0, specOnly = 0, bothDisagree = 0;
+  for (const [pid, panns] of byParagraph) {
+    if (panns.length !== 3) continue;
+    const cats = panns.map(a => a.label.content_category);
+    const specs = panns.map(a => a.label.specificity_level);
+    const catU = new Set(cats).size === 1;
+    const specU = new Set(specs).size === 1;
+    if (!catU && specU) catOnly++;
+    if (catU && !specU) specOnly++;
+    if (!catU && !specU) bothDisagree++;
+  }
+  console.log(`  Paragraphs needing Stage 2: ${needsStage2.toLocaleString()} (${pct(needsStage2, nParagraphs)})`);
+  console.log(`    Cat disagree only:  ${catOnly.toLocaleString()}`);
+  console.log(`    Spec disagree only: ${specOnly.toLocaleString()}`);
+  console.log(`    Both disagree:      ${bothDisagree.toLocaleString()}`);
+
+  // Estimate cost: stage2 uses sonnet, roughly 3x more expensive per call
+  // Average input tokens from stage1 + annotations context.
+  // NOTE(review): pricing ($3/MTok in, $15/MTok out) and the 150-token output
+  // guess are hard-coded — revisit if the Stage 2 model or pricing changes.
+  const avgInput = totalInput / anns.length;
+  const stage2InputEst = (avgInput + 500) * needsStage2; // extra for prior annotation context
+  const stage2CostEst = (stage2InputEst / 1e6) * 3.0 + (needsStage2 * 150 / 1e6) * 15.0; // $3/MTok in, $15/MTok out estimate
+  console.log(`\n  Estimated Stage 2 cost: ~$${stage2CostEst.toFixed(0)} (rough, Sonnet pricing)`);
+}
+
+main().catch(err => { console.error(err); process.exit(1); });
diff --git a/ts/scripts/stage1-run.ts b/ts/scripts/stage1-run.ts
new file mode 100644
index 0000000..9167008
--- /dev/null
+++ b/ts/scripts/stage1-run.ts
@@ -0,0 +1,158 @@
+/**
+ * Stage 1 production run: annotate all paragraphs with 3 models.
+ *
+ * Features:
+ * - Crash-safe: appends one JSONL line per annotation, resumes on restart
+ * - All 3 models run in parallel per paragraph (not sequentially)
+ * - Real-time progress + cost logging
+ * - Configurable concurrency (total concurrent API calls)
+ *
+ * Usage:
+ * bun ts/scripts/stage1-run.ts [--concurrency 30] [--input ../data/paragraphs/training.jsonl]
+ *
+ * Output:
+ * ../data/annotations/stage1.jsonl — one Annotation per (paragraph, model) pair
+ */
+import { readJsonl, readJsonlRaw, appendJsonl } from "../src/lib/jsonl.ts";
+import { Paragraph } from "../src/schemas/paragraph.ts";
+import { STAGE1_MODELS } from "../src/lib/openrouter.ts";
+import { annotateParagraph, type AnnotateOpts } from "../src/label/annotate.ts";
+import { PROMPT_VERSION } from "../src/label/prompts.ts";
+import { v4 as uuidv4 } from "uuid";
+import { mkdir } from "node:fs/promises";
+import { existsSync } from "node:fs";
+import pLimit from "p-limit";
+
+// ── Args ────────────────────────────────────────────────────────────────
+// CLI arguments after the script path.
+const args = process.argv.slice(2);
+/** Value following `--name` on the command line, or undefined if absent (or `--name` is the last token). */
+function flag(name: string): string | undefined {
+  const idx = args.indexOf(`--${name}`);
+  return idx === -1 ? undefined : args[idx + 1];
+}
+// Total concurrent API calls across all models (default 30).
+const CONCURRENCY = parseInt(flag("concurrency") ?? "30", 10);
+// NOTE(review): URL.pathname keeps percent-encoding and behaves oddly on
+// Windows drive paths — fileURLToPath would be safer; confirm target platforms.
+const INPUT_PATH = flag("input") ?? new URL("../../data/paragraphs/training.jsonl", import.meta.url).pathname;
+const OUTPUT_DIR = new URL("../../data/annotations", import.meta.url).pathname;
+const OUTPUT_PATH = `${OUTPUT_DIR}/stage1.jsonl`;
+
+// ── Main ────────────────────────────────────────────────────────────────
+/**
+ * Entry point: annotates every (paragraph, model) pair with STAGE1_MODELS,
+ * appending one JSONL line per annotation to OUTPUT_PATH. Crash-safe: on
+ * restart, pairs already present in the output file are skipped. Progress,
+ * cost, and failures are logged to stderr (stdout stays clean).
+ */
+async function main() {
+  if (!existsSync(OUTPUT_DIR)) await mkdir(OUTPUT_DIR, { recursive: true });
+
+  // Load training data
+  console.error(`Loading paragraphs from ${INPUT_PATH}...`);
+  const { records: paragraphs, skipped } = await readJsonl(INPUT_PATH, Paragraph);
+  if (skipped > 0) console.error(`  ⚠ Skipped ${skipped} invalid lines`);
+  console.error(`  Loaded ${paragraphs.length} paragraphs`);
+  console.error(`  Models: ${STAGE1_MODELS.join(", ")}`);
+  console.error(`  Prompt: ${PROMPT_VERSION}`);
+  console.error(`  Concurrency: ${CONCURRENCY}`);
+
+  const totalJobs = paragraphs.length * STAGE1_MODELS.length;
+  console.error(`  Total annotations needed: ${totalJobs.toLocaleString()}`);
+
+  // Load existing results for resume: a pair is "done" if a line with its
+  // paragraphId + modelId already exists in the output file.
+  const doneKeys = new Set();
+  let resumedCost = 0;
+  if (existsSync(OUTPUT_PATH)) {
+    const { records: existing, skipped: badLines } = await readJsonlRaw(OUTPUT_PATH);
+    for (const rec of existing) {
+      const r = rec as { paragraphId?: string; provenance?: { modelId?: string; costUsd?: number } };
+      if (r.paragraphId && r.provenance?.modelId) {
+        doneKeys.add(`${r.paragraphId}|${r.provenance.modelId}`);
+        resumedCost += r.provenance.costUsd ?? 0;
+      }
+    }
+    if (doneKeys.size > 0) {
+      console.error(`  Resuming: ${doneKeys.size.toLocaleString()} annotations already done ($${resumedCost.toFixed(2)}), ${(totalJobs - doneKeys.size).toLocaleString()} remaining`);
+    }
+    if (badLines > 0) console.error(`  ⚠ ${badLines} corrupted lines in output (skipped)`);
+  }
+
+  if (doneKeys.size >= totalJobs) {
+    console.error("  ✓ All annotations already complete!");
+    return;
+  }
+
+  // Build job list: (paragraph, model) pairs not yet done
+  type Job = { paragraph: Paragraph; modelId: string };
+  const jobs: Job[] = [];
+  for (const paragraph of paragraphs) {
+    for (const modelId of STAGE1_MODELS) {
+      if (!doneKeys.has(`${paragraph.id}|${modelId}`)) {
+        jobs.push({ paragraph, modelId });
+      }
+    }
+  }
+  console.error(`  Jobs to run: ${jobs.length.toLocaleString()}`);
+
+  // Run with concurrency limiter
+  const runId = uuidv4();
+  const limit = pLimit(CONCURRENCY);
+  let completed = doneKeys.size; // includes resumed annotations
+  let failed = 0;
+  let sessionCost = 0;
+  const startTime = Date.now();
+
+  // Progress logging: rate/ETA are computed from this session's throughput
+  // only (completed - doneKeys.size), so resumed work doesn't inflate them.
+  const logInterval = setInterval(() => {
+    const elapsed = (Date.now() - startTime) / 1000;
+    const rate = (completed - doneKeys.size) / elapsed;
+    const remaining = totalJobs - completed;
+    const eta = rate > 0 ? remaining / rate : Infinity;
+    const etaMin = Math.round(eta / 60);
+    process.stderr.write(
+      `\r  ${completed.toLocaleString()}/${totalJobs.toLocaleString()} (${((completed / totalJobs) * 100).toFixed(1)}%)` +
+      `  $${(resumedCost + sessionCost).toFixed(2)}` +
+      `  ${rate.toFixed(1)}/s` +
+      `  ETA ${etaMin}m` +
+      `  ${failed} failed  `,
+    );
+  }, 2000);
+
+  // NOTE(review): concurrent tasks append to the same file; assumes
+  // appendJsonl writes one whole line per call — confirm implementation.
+  const tasks = jobs.map((job) =>
+    limit(async () => {
+      const opts: AnnotateOpts = {
+        modelId: job.modelId,
+        stage: "stage1",
+        runId,
+        promptVersion: PROMPT_VERSION,
+        reasoningEffort: "low",
+      };
+
+      try {
+        const ann = await annotateParagraph(job.paragraph, opts);
+        await appendJsonl(OUTPUT_PATH, ann);
+        sessionCost += ann.provenance.costUsd;
+        completed++;
+      } catch (error) {
+        failed++;
+        const msg = error instanceof Error ? error.message : String(error);
+        // Log failures to stderr but don't crash — we can retry on next run
+        console.error(`\n  ✖ ${job.modelId} × ${job.paragraph.id}: ${msg}`);
+      }
+    }),
+  );
+
+  await Promise.all(tasks);
+  clearInterval(logInterval);
+
+  const elapsed = ((Date.now() - startTime) / 1000).toFixed(0);
+  console.error(
+    `\n\n  ═══ COMPLETE ═══` +
+    `\n  Annotations: ${completed.toLocaleString()}/${totalJobs.toLocaleString()}` +
+    `\n  Failed: ${failed}` +
+    `\n  Session cost: $${sessionCost.toFixed(2)}` +
+    `\n  Total cost: $${(resumedCost + sessionCost).toFixed(2)}` +
+    `\n  Wall time: ${elapsed}s` +
+    `\n  Output: ${OUTPUT_PATH}`,
+  );
+
+  if (failed > 0) {
+    console.error(`\n  ⚠ ${failed} failures — re-run this script to retry them.`);
+  }
+}
+
+main().catch((err) => {
+ console.error(err);
+ process.exit(1);
+});
diff --git a/ts/src/analyze/corpus-stats.ts b/ts/src/analyze/corpus-stats.ts
new file mode 100644
index 0000000..62d6b28
--- /dev/null
+++ b/ts/src/analyze/corpus-stats.ts
@@ -0,0 +1,617 @@
+/**
+ * Corpus statistical analysis for extraction QA and representative sampling.
+ * Produces a comprehensive breakdown saved as JSON + human-readable report.
+ */
+import { readJsonl } from "../lib/jsonl.ts";
+import { Paragraph } from "../schemas/paragraph.ts";
+import type { Paragraph as ParagraphType } from "../schemas/paragraph.ts";
+import { writeFile, mkdir } from "node:fs/promises";
+import { dirname } from "node:path";
+
+// Output/input locations, relative to the process working directory.
+const DATA = "../data";
+const PARAGRAPHS_PATH = `${DATA}/paragraphs/paragraphs.jsonl`;
+
+// ─── Statistical helpers ───
+
+/** Median of an already-sorted numeric array; 0 for an empty input. */
+function median(sorted: number[]): number {
+  const n = sorted.length;
+  if (n === 0) return 0;
+  const half = Math.floor(n / 2);
+  if (n % 2 !== 0) return sorted[half]!;
+  return (sorted[half - 1]! + sorted[half]!) / 2;
+}
+
+/** Linear-interpolated percentile (p in [0, 100]) of a sorted array; 0 if empty. */
+function percentile(sorted: number[], p: number): number {
+  if (sorted.length === 0) return 0;
+  const pos = (p / 100) * (sorted.length - 1);
+  const lo = Math.floor(pos);
+  const hi = Math.ceil(pos);
+  if (lo === hi) return sorted[lo]!;
+  // Interpolate between the two bracketing ranks.
+  return sorted[lo]! + (sorted[hi]! - sorted[lo]!) * (pos - lo);
+}
+
+function mean(arr: number[]): number {
+ return arr.length === 0 ? 0 : arr.reduce((a, b) => a + b, 0) / arr.length;
+}
+
+function stddev(arr: number[]): number {
+ if (arr.length < 2) return 0;
+ const m = mean(arr);
+ return Math.sqrt(arr.reduce((sum, v) => sum + (v - m) ** 2, 0) / (arr.length - 1));
+}
+
+function skewness(arr: number[]): number {
+ if (arr.length < 3) return 0;
+ const m = mean(arr);
+ const s = stddev(arr);
+ if (s === 0) return 0;
+ const n = arr.length;
+ return (n / ((n - 1) * (n - 2))) * arr.reduce((sum, v) => sum + ((v - m) / s) ** 3, 0);
+}
+
+function kurtosis(arr: number[]): number {
+ if (arr.length < 4) return 0;
+ const m = mean(arr);
+ const s = stddev(arr);
+ if (s === 0) return 0;
+ const n = arr.length;
+ const k4 = arr.reduce((sum, v) => sum + ((v - m) / s) ** 4, 0) / n;
+ return k4 - 3; // excess kurtosis
+}
+
+/** Summary statistics for a numeric sample (values rounded to 2 dp, except min/max). */
+interface DistributionStats {
+  count: number; // sample size
+  mean: number;
+  stddev: number; // sample (n-1) standard deviation
+  min: number;
+  p5: number; // 5th percentile
+  p25: number; // first quartile
+  median: number;
+  p75: number; // third quartile
+  p95: number; // 95th percentile
+  max: number;
+  skewness: number; // adjusted Fisher–Pearson sample skewness
+  kurtosis: number; // excess kurtosis (normal = 0)
+}
+
+function distributionStats(values: number[]): DistributionStats {
+ const sorted = [...values].sort((a, b) => a - b);
+ return {
+ count: values.length,
+ mean: round(mean(values)),
+ stddev: round(stddev(values)),
+ min: sorted[0] ?? 0,
+ p5: round(percentile(sorted, 5)),
+ p25: round(percentile(sorted, 25)),
+ median: round(median(sorted)),
+ p75: round(percentile(sorted, 75)),
+ p95: round(percentile(sorted, 95)),
+ max: sorted[sorted.length - 1] ?? 0,
+ skewness: round(skewness(values)),
+ kurtosis: round(kurtosis(values)),
+ };
+}
+
+/** Round `n` to `decimals` places (default 2). */
+function round(n: number, decimals = 2): number {
+  const scale = 10 ** decimals;
+  return Math.round(n * scale) / scale;
+}
+
+// ─── Histogram builder ───
+
+function histogram(values: number[], binCount: number): Array<{ binStart: number; binEnd: number; count: number; pct: number }> {
+ if (values.length === 0) return [];
+ const sorted = [...values].sort((a, b) => a - b);
+ const min = sorted[0]!;
+ const max = sorted[sorted.length - 1]!;
+ const binWidth = Math.ceil((max - min) / binCount) || 1;
+
+ const bins: Array<{ binStart: number; binEnd: number; count: number; pct: number }> = [];
+ for (let i = 0; i < binCount; i++) {
+ const binStart = min + i * binWidth;
+ const binEnd = binStart + binWidth;
+ const count = sorted.filter((v) => v >= binStart && (i === binCount - 1 ? v <= binEnd : v < binEnd)).length;
+ bins.push({ binStart, binEnd, count, pct: round((count / values.length) * 100) });
+ }
+ return bins;
+}
+
+// ─── Analysis functions ───
+
+/** Per-filing aggregate row (one entry per accession number, labeled with its company). */
+interface CompanyStats {
+  cik: string; // SEC Central Index Key of the filer
+  name: string;
+  ticker: string;
+  filingType: string; // e.g. 10-K vs 8-K
+  filingDate: string;
+  paragraphCount: number; // paragraphs extracted from this filing
+  totalWords: number;
+  avgWordCount: number; // mean words per paragraph (rounded)
+  medianWordCount: number;
+}
+
+/**
+ * Aggregate per-filing paragraph statistics (one CompanyStats entry per
+ * accession number), sorted by descending paragraph count.
+ * Fix: restore the Map's type arguments, which were missing (untyped `new Map()`).
+ */
+function analyzeByCompany(paragraphs: ParagraphType[]): CompanyStats[] {
+  // Group paragraphs by filing accession number.
+  const byAccession = new Map<string, ParagraphType[]>();
+  for (const p of paragraphs) {
+    const key = p.filing.accessionNumber;
+    const existing = byAccession.get(key) ?? [];
+    existing.push(p);
+    byAccession.set(key, existing);
+  }
+
+  const stats: CompanyStats[] = [];
+  for (const [, paras] of byAccession) {
+    const first = paras[0]!; // filing metadata is shared by all paragraphs in the group
+    const wcs = paras.map((p) => p.wordCount);
+    const sorted = [...wcs].sort((a, b) => a - b);
+    stats.push({
+      cik: first.filing.cik,
+      name: first.filing.companyName,
+      ticker: first.filing.ticker,
+      filingType: first.filing.filingType,
+      filingDate: first.filing.filingDate,
+      paragraphCount: paras.length,
+      totalWords: wcs.reduce((a, b) => a + b, 0),
+      avgWordCount: round(mean(wcs)),
+      medianWordCount: median(sorted),
+    });
+  }
+
+  return stats.sort((a, b) => b.paragraphCount - a.paragraphCount);
+}
+
+/** Word-count distribution: overall stats, a 20-bin histogram, and fixed report buckets. */
+function analyzeWordCountDistribution(paragraphs: ParagraphType[]) {
+  const wcs = paragraphs.map((p) => p.wordCount);
+  // Count of word counts in [lo, hi); `hi` becomes inclusive when inclusiveHi is set.
+  const inRange = (lo: number, hi: number, inclusiveHi = false) =>
+    wcs.filter((w) => w >= lo && (inclusiveHi ? w <= hi : w < hi)).length;
+  return {
+    overall: distributionStats(wcs),
+    histogram: histogram(wcs, 20),
+    buckets: {
+      "20-50": inRange(20, 50),
+      "50-100": inRange(50, 100),
+      "100-200": inRange(100, 200),
+      "200-300": inRange(200, 300),
+      "300-400": inRange(300, 400),
+      "400-500": inRange(400, 500, true),
+      "500+": wcs.filter((w) => w > 500).length,
+    },
+  };
+}
+
+/**
+ * Per filing type: paragraph count, distinct company count, and word-count stats.
+ * Fix: `Record` and `Map` were missing their type arguments (invalid TypeScript);
+ * restored from the shapes actually stored.
+ */
+function analyzeFilingTypeBreakdown(paragraphs: ParagraphType[]) {
+  // Group paragraphs by filing type.
+  const byType = new Map<string, ParagraphType[]>();
+  for (const p of paragraphs) {
+    const key = p.filing.filingType;
+    const existing = byType.get(key) ?? [];
+    existing.push(p);
+    byType.set(key, existing);
+  }
+
+  const result: Record<string, { paragraphs: number; companies: number; wordCountDist: DistributionStats }> = {};
+  for (const [type, paras] of byType) {
+    const companies = new Set(paras.map((p) => p.filing.cik));
+    result[type] = {
+      paragraphs: paras.length,
+      companies: companies.size,
+      wordCountDist: distributionStats(paras.map((p) => p.wordCount)),
+    };
+  }
+  return result;
+}
+
+/**
+ * Paragraph counts keyed by filing month ("YYYY-MM") and by fiscal year,
+ * each sorted by key. Fix: restore the Maps' missing type arguments.
+ */
+function analyzeTemporalDistribution(paragraphs: ParagraphType[]) {
+  const byMonth = new Map<string, number>();
+  const byYear = new Map<ParagraphType["filing"]["fiscalYear"], number>();
+  for (const p of paragraphs) {
+    const month = p.filing.filingDate.slice(0, 7); // "YYYY-MM" prefix of the ISO date
+    byMonth.set(month, (byMonth.get(month) ?? 0) + 1);
+    byYear.set(p.filing.fiscalYear, (byYear.get(p.filing.fiscalYear) ?? 0) + 1);
+  }
+
+  return {
+    // Default lexicographic entry sort orders correctly because keys share a fixed format.
+    byFilingMonth: Object.fromEntries([...byMonth.entries()].sort()),
+    byFiscalYear: Object.fromEntries([...byYear.entries()].sort()),
+  };
+}
+
+/**
+ * Count paragraphs per SEC item designation, sorted by item key.
+ * Fix: restore the Map's missing type arguments.
+ */
+function analyzeSecItemDistribution(paragraphs: ParagraphType[]) {
+  const byItem = new Map<ParagraphType["filing"]["secItem"], number>();
+  for (const p of paragraphs) {
+    const item = p.filing.secItem;
+    byItem.set(item, (byItem.get(item) ?? 0) + 1);
+  }
+  return Object.fromEntries([...byItem.entries()].sort());
+}
+
+/** Distribution of paragraph indices within their section, plus coarse position buckets. */
+function analyzeParagraphPosition(paragraphs: ParagraphType[]) {
+  const indices = paragraphs.map((p) => p.paragraphIndex);
+  const countWhere = (pred: (i: number) => boolean) => indices.filter(pred).length;
+  return {
+    indexDistribution: distributionStats(indices),
+    // Paragraphs per position bucket
+    positionBuckets: {
+      "0-2 (opening)": countWhere((i) => i <= 2),
+      "3-5 (early)": countWhere((i) => i >= 3 && i <= 5),
+      "6-10 (middle)": countWhere((i) => i >= 6 && i <= 10),
+      "11-20 (late)": countWhere((i) => i >= 11 && i <= 20),
+      "21+ (deep)": countWhere((i) => i >= 21),
+    },
+  };
+}
+
+/**
+ * Word-count distribution per paragraph-position bucket — do paragraphs get
+ * shorter or longer deeper into the section?
+ * Fix: both `Record` annotations were missing their type arguments (invalid
+ * TypeScript); restored.
+ */
+function analyzeTextLengthByPosition(paragraphs: ParagraphType[]) {
+  const byBucket: Record<string, number[]> = {
+    "0-2": [],
+    "3-5": [],
+    "6-10": [],
+    "11-20": [],
+    "21+": [],
+  };
+  for (const p of paragraphs) {
+    const idx = p.paragraphIndex;
+    if (idx <= 2) byBucket["0-2"]!.push(p.wordCount);
+    else if (idx <= 5) byBucket["3-5"]!.push(p.wordCount);
+    else if (idx <= 10) byBucket["6-10"]!.push(p.wordCount);
+    else if (idx <= 20) byBucket["11-20"]!.push(p.wordCount);
+    else byBucket["21+"]!.push(p.wordCount);
+  }
+
+  // Only emit stats for buckets that actually contain paragraphs.
+  const result: Record<string, DistributionStats> = {};
+  for (const [bucket, wcs] of Object.entries(byBucket)) {
+    if (wcs.length > 0) result[bucket] = distributionStats(wcs);
+  }
+  return result;
+}
+
+/** Filing-size profile: distributions of paragraphs/words per filing plus size categories. */
+function analyzeCompanySizeDistribution(companyStats: CompanyStats[]) {
+  const paraCounts = companyStats.map((c) => c.paragraphCount);
+  const totalWords = companyStats.map((c) => c.totalWords);
+  // Inclusive-range counter over paragraph counts.
+  const inRange = (lo: number, hi: number) => paraCounts.filter((c) => c >= lo && c <= hi).length;
+
+  return {
+    paragraphsPerFiling: distributionStats(paraCounts),
+    wordsPerFiling: distributionStats(totalWords),
+    paragraphHistogram: histogram(paraCounts, 15),
+    sizeCategories: {
+      "tiny (1-3 paras)": inRange(1, 3),
+      "small (4-8 paras)": inRange(4, 8),
+      "medium (9-15 paras)": inRange(9, 15),
+      "large (16-30 paras)": inRange(16, 30),
+      "very large (31+ paras)": paraCounts.filter((c) => c >= 31).length,
+    },
+  };
+}
+
+/**
+ * How many companies (by unique CIK) have a ticker symbol vs not.
+ * Fixes: restore the Sets' missing type arguments, and guard the empty
+ * corpus so coverage reports 0 instead of NaN.
+ */
+function analyzeTickerCoverage(paragraphs: ParagraphType[]) {
+  const withTicker = new Set<string>();
+  const withoutTicker = new Set<string>();
+  for (const p of paragraphs) {
+    if (p.filing.ticker) withTicker.add(p.filing.cik);
+    else withoutTicker.add(p.filing.cik);
+  }
+  const total = withTicker.size + withoutTicker.size;
+  return {
+    companiesWithTicker: withTicker.size,
+    companiesWithoutTicker: withoutTicker.size,
+    tickerCoveragePct: total === 0 ? 0 : round((withTicker.size / total) * 100),
+  };
+}
+
+/** Flag filings whose paragraph count falls outside Tukey fences (Q1/Q3 ± 1.5·IQR). */
+function detectOutliers(companyStats: CompanyStats[]) {
+  const sorted = companyStats.map((c) => c.paragraphCount).sort((a, b) => a - b);
+  const q1 = percentile(sorted, 25);
+  const q3 = percentile(sorted, 75);
+  const iqr = q3 - q1;
+  const upperFence = q3 + 1.5 * iqr;
+  const lowerFence = q1 - 1.5 * iqr;
+
+  const isOutlier = (c: CompanyStats) => c.paragraphCount > upperFence || c.paragraphCount < lowerFence;
+
+  return {
+    q1,
+    q3,
+    iqr: round(iqr),
+    upperFence: round(upperFence),
+    lowerFence: round(lowerFence),
+    outlierFilings: companyStats.filter(isOutlier).map((c) => ({
+      name: c.name,
+      ticker: c.ticker,
+      paragraphCount: c.paragraphCount,
+      totalWords: c.totalWords,
+      direction: c.paragraphCount > upperFence ? "high" : "low",
+    })),
+  };
+}
+
+// ─── Text content heuristics ───
+
+/**
+ * Keyword-based content signal estimates (NOT ground-truth labels): how many
+ * paragraphs match each signal regex, plus overlap stats (paragraphs matching
+ * several signals or none).
+ * Fix: the original ran two separate loops testing every regex against every
+ * paragraph twice; merged into a single pass with identical results.
+ */
+function analyzeContentSignals(paragraphs: ParagraphType[]) {
+  const signals = {
+    boardGovernance: 0,
+    managementRole: 0,
+    riskProcess: 0,
+    thirdParty: 0,
+    incidentDisclosure: 0,
+    strategyIntegration: 0,
+    boilerplate: 0,
+  };
+
+  const patterns = {
+    boardGovernance: /\b(board\s+of\s+directors|board\s+oversight|audit\s+committee|board\s+member|director|board\s+level|governance\s+committee)\b/i,
+    managementRole: /\b(chief\s+information\s+security\s+officer|CISO|chief\s+technology\s+officer|CTO|chief\s+information\s+officer|CIO|management\s+team|security\s+team|reports?\s+to\s+the)\b/i,
+    riskProcess: /\b(risk\s+assessment|risk\s+management|vulnerability\s+(assessment|scanning)|penetration\s+test|threat\s+intelligence|risk\s+framework|NIST|ISO\s*27001|identify.*assess.*mitigate)\b/i,
+    thirdParty: /\b(third[- ]party|vendor|supplier|service\s+provider|outsourc|contractor|supply\s+chain|due\s+diligence)\b/i,
+    incidentDisclosure: /\b(incident|breach|unauthorized\s+access|data\s+loss|ransomware|cyber\s*attack|compromise[d]?|disruption|material\s+impact)\b/i,
+    strategyIntegration: /\b(strateg(y|ic)|business\s+objective|integrated|enterprise[- ]wide|competitive\s+advantage|digital\s+transformation|investment\s+in\s+cyber)\b/i,
+    boilerplate: /\b(no\s+assurance|cannot\s+guarantee|may\s+not\s+be\s+successful|there\s+can\s+be\s+no|subject\s+to\s+risks|could\s+have\s+a\s+material|may\s+adversely\s+affect)\b/i,
+  };
+
+  // Single pass: count per-signal hits and per-paragraph overlap together.
+  let multiSignal = 0;
+  let noSignal = 0;
+  for (const p of paragraphs) {
+    let matches = 0;
+    for (const [key, pattern] of Object.entries(patterns)) {
+      if (pattern.test(p.text)) {
+        signals[key as keyof typeof signals]++;
+        matches++;
+      }
+    }
+    if (matches > 1) multiSignal++;
+    if (matches === 0) noSignal++;
+  }
+
+  return {
+    signalCounts: signals,
+    signalPcts: Object.fromEntries(
+      Object.entries(signals).map(([k, v]) => [k, round((v / paragraphs.length) * 100)]),
+    ),
+    multiSignalParagraphs: multiSignal,
+    noSignalParagraphs: noSignal,
+    multiSignalPct: round((multiSignal / paragraphs.length) * 100),
+    noSignalPct: round((noSignal / paragraphs.length) * 100),
+  };
+}
+
+// ─── Report generation ───
+
+/**
+ * Render the analysis object into a human-readable plain-text report.
+ * Expects the keys produced by main(): overview, filingTypeBreakdown,
+ * wordCountDistribution, companySizeDistribution, temporalDistribution,
+ * paragraphPosition, contentSignals, outliers, tickerCoverage, companyStats.
+ * Fix: every bare `Record` annotation/cast was missing its type arguments
+ * (invalid TypeScript); restored from the shapes the analysis functions return.
+ */
+function generateReport(analysis: Record<string, unknown>): string {
+  const lines: string[] = [];
+  const hr = "═".repeat(72);
+  const sr = "─".repeat(72);
+
+  lines.push(hr);
+  lines.push(" SEC-cyBERT Corpus Statistical Analysis");
+  lines.push(` Generated: ${new Date().toISOString()}`);
+  lines.push(` Data: ${PARAGRAPHS_PATH}`);
+  lines.push(hr);
+  lines.push("");
+
+  // Overview
+  const overview = analysis["overview"] as Record<string, number>;
+  lines.push("1. OVERVIEW");
+  lines.push(sr);
+  lines.push(`  Total paragraphs:  ${overview["totalParagraphs"]}`);
+  lines.push(`  Total filings:     ${overview["totalFilings"]}`);
+  lines.push(`  Unique companies:  ${overview["uniqueCompanies"]}`);
+  lines.push(`  Total words:       ${overview["totalWords"]?.toLocaleString()}`);
+  lines.push(`  Avg words/paragraph: ${overview["avgWordsPerParagraph"]}`);
+  lines.push(`  Avg paras/filing:    ${overview["avgParagraphsPerFiling"]}`);
+  lines.push("");
+
+  // Filing type breakdown
+  const filingTypes = analysis["filingTypeBreakdown"] as Record<string, { paragraphs: number; companies: number; wordCountDist: DistributionStats }>;
+  lines.push("2. FILING TYPE BREAKDOWN");
+  lines.push(sr);
+  for (const [type, data] of Object.entries(filingTypes)) {
+    lines.push(`  ${type}: ${data.paragraphs} paragraphs from ${data.companies} companies`);
+    lines.push(`    Word count: mean=${data.wordCountDist.mean}, median=${data.wordCountDist.median}, std=${data.wordCountDist.stddev}`);
+  }
+  lines.push("");
+
+  // Word count distribution
+  const wcDist = analysis["wordCountDistribution"] as { overall: DistributionStats; buckets: Record<string, number> };
+  lines.push("3. WORD COUNT DISTRIBUTION");
+  lines.push(sr);
+  const ov = wcDist.overall;
+  lines.push(`  Mean: ${ov.mean} (std: ${ov.stddev})`);
+  lines.push(`  Median: ${ov.median}`);
+  lines.push(`  Range: [${ov.min}, ${ov.max}]`);
+  lines.push(`  IQR: [${ov.p25}, ${ov.p75}]`);
+  lines.push(`  Skewness: ${ov.skewness} (${ov.skewness > 0.5 ? "right-skewed" : ov.skewness < -0.5 ? "left-skewed" : "roughly symmetric"})`);
+  lines.push(`  Kurtosis: ${ov.kurtosis} (excess; ${ov.kurtosis > 1 ? "heavy-tailed" : ov.kurtosis < -1 ? "light-tailed" : "near-normal"})`);
+  lines.push("");
+  lines.push("  Buckets:");
+  for (const [bucket, count] of Object.entries(wcDist.buckets)) {
+    const pct = round((count / overview["totalParagraphs"]!) * 100);
+    const bar = "█".repeat(Math.round(pct));
+    lines.push(`    ${bucket.padEnd(12)} ${String(count).padStart(5)} (${String(pct).padStart(5)}%) ${bar}`);
+  }
+  lines.push("");
+
+  // Company size distribution
+  const compSize = analysis["companySizeDistribution"] as { paragraphsPerFiling: DistributionStats; wordsPerFiling: DistributionStats; sizeCategories: Record<string, number> };
+  lines.push("4. FILING SIZE DISTRIBUTION (paragraphs per filing)");
+  lines.push(sr);
+  const pf = compSize.paragraphsPerFiling;
+  lines.push(`  Mean: ${pf.mean} (std: ${pf.stddev})`);
+  lines.push(`  Median: ${pf.median}`);
+  lines.push(`  Range: [${pf.min}, ${pf.max}]`);
+  lines.push("");
+  lines.push("  Size categories:");
+  const totalFilings = Object.values(compSize.sizeCategories).reduce((a, b) => a + b, 0);
+  for (const [cat, count] of Object.entries(compSize.sizeCategories)) {
+    const pct = round((count / totalFilings) * 100);
+    const bar = "█".repeat(Math.round(pct / 2));
+    lines.push(`    ${cat.padEnd(25)} ${String(count).padStart(4)} (${String(pct).padStart(5)}%) ${bar}`);
+  }
+  lines.push("");
+
+  // Temporal
+  const temporal = analysis["temporalDistribution"] as { byFilingMonth: Record<string, number>; byFiscalYear: Record<string, number> };
+  lines.push("5. TEMPORAL DISTRIBUTION");
+  lines.push(sr);
+  lines.push("  By filing month:");
+  for (const [month, count] of Object.entries(temporal.byFilingMonth)) {
+    const bar = "█".repeat(Math.round(count / 20));
+    lines.push(`    ${month}  ${String(count).padStart(5)} ${bar}`);
+  }
+  lines.push("");
+  lines.push("  By fiscal year:");
+  for (const [year, count] of Object.entries(temporal.byFiscalYear)) {
+    lines.push(`    ${year}  ${count}`);
+  }
+  lines.push("");
+
+  // Paragraph position
+  const position = analysis["paragraphPosition"] as { positionBuckets: Record<string, number> };
+  lines.push("6. PARAGRAPH POSITION IN SECTION");
+  lines.push(sr);
+  for (const [pos, count] of Object.entries(position.positionBuckets)) {
+    const pct = round((count / overview["totalParagraphs"]!) * 100);
+    lines.push(`    ${pos.padEnd(20)} ${String(count).padStart(5)} (${String(pct).padStart(5)}%)`);
+  }
+  lines.push("");
+
+  // Content signals
+  const content = analysis["contentSignals"] as { signalCounts: Record<string, number>; signalPcts: Record<string, number>; multiSignalParagraphs: number; noSignalParagraphs: number; multiSignalPct: number; noSignalPct: number };
+  lines.push("7. HEURISTIC CONTENT SIGNAL ANALYSIS");
+  lines.push(sr);
+  lines.push("  (Keyword-based estimates — NOT ground truth labels)");
+  lines.push("");
+  for (const [signal, count] of Object.entries(content.signalCounts)) {
+    const pct = content.signalPcts[signal]!;
+    const bar = "█".repeat(Math.round(pct / 2));
+    lines.push(`    ${signal.padEnd(22)} ${String(count).padStart(5)} (${String(pct).padStart(5)}%) ${bar}`);
+  }
+  lines.push("");
+  lines.push(`  Multi-signal paragraphs: ${content.multiSignalParagraphs} (${content.multiSignalPct}%)`);
+  lines.push(`  No-signal paragraphs:    ${content.noSignalParagraphs} (${content.noSignalPct}%)`);
+  lines.push("");
+
+  // Outliers
+  const outliers = analysis["outliers"] as { upperFence: number; lowerFence: number; iqr: number; outlierFilings: Array<{ name: string; ticker: string; paragraphCount: number; totalWords: number; direction: string }> };
+  lines.push("8. OUTLIER FILINGS (IQR method)");
+  lines.push(sr);
+  lines.push(`  IQR: ${outliers.iqr}, Upper fence: ${outliers.upperFence}, Lower fence: ${outliers.lowerFence}`);
+  lines.push(`  ${outliers.outlierFilings.length} outlier filings:`);
+  for (const o of outliers.outlierFilings.slice(0, 20)) {
+    lines.push(`    ${o.direction.toUpperCase().padEnd(5)} ${(o.ticker || o.name).padEnd(30)} ${o.paragraphCount} paras, ${o.totalWords} words`);
+  }
+  lines.push("");
+
+  // Ticker coverage
+  const ticker = analysis["tickerCoverage"] as { companiesWithTicker: number; companiesWithoutTicker: number; tickerCoveragePct: number };
+  lines.push("9. TICKER COVERAGE");
+  lines.push(sr);
+  lines.push(`  With ticker:    ${ticker.companiesWithTicker} companies (${ticker.tickerCoveragePct}%)`);
+  lines.push(`  Without ticker: ${ticker.companiesWithoutTicker} companies`);
+  lines.push("");
+
+  // Top 20 companies by paragraph count
+  const companies = analysis["companyStats"] as CompanyStats[];
+  lines.push("10. TOP 20 COMPANIES BY PARAGRAPH COUNT");
+  lines.push(sr);
+  lines.push(`    ${"Company".padEnd(40)} ${"Ticker".padEnd(8)} ${"Type".padEnd(5)} ${"Paras".padStart(6)} ${"Words".padStart(7)} ${"Avg WC".padStart(7)}`);
+  for (const c of companies.slice(0, 20)) {
+    lines.push(
+      `    ${c.name.slice(0, 39).padEnd(40)} ${(c.ticker || "-").padEnd(8)} ${c.filingType.padEnd(5)} ${String(c.paragraphCount).padStart(6)} ${String(c.totalWords).padStart(7)} ${String(c.avgWordCount).padStart(7)}`,
+    );
+  }
+  lines.push("");
+
+  // Bottom 10
+  lines.push("11. BOTTOM 10 COMPANIES BY PARAGRAPH COUNT");
+  lines.push(sr);
+  for (const c of companies.slice(-10).reverse()) {
+    lines.push(
+      `    ${c.name.slice(0, 39).padEnd(40)} ${(c.ticker || "-").padEnd(8)} ${c.filingType.padEnd(5)} ${String(c.paragraphCount).padStart(6)} ${String(c.totalWords).padStart(7)}`,
+    );
+  }
+  lines.push("");
+
+  // Sampling recommendations
+  lines.push("12. SAMPLING RECOMMENDATIONS");
+  lines.push(sr);
+  const totalParas = overview["totalParagraphs"]!;
+  lines.push(`  For a representative sample of ~200 paragraphs (${round((200 / totalParas) * 100)}% of corpus):`);
+  lines.push("  Stratify by:");
+  lines.push("    - Filing type (10-K vs 8-K, proportional)");
+  lines.push("    - Paragraph position (opening/middle/late)");
+  lines.push("    - Word count quartile (short/medium/long/very long)");
+  lines.push("    - Content signal diversity (ensure all categories represented)");
+  lines.push("    - Company size (small/medium/large filings)");
+  lines.push("");
+  lines.push("  Ensure minimum 5 paragraphs per stratum for statistical validity.");
+  lines.push("");
+
+  lines.push(hr);
+  lines.push(" END OF ANALYSIS");
+  lines.push(hr);
+
+  return lines.join("\n");
+}
+
+// ─── Main ───
+
+/**
+ * Entry point: load the paragraph corpus, compute all analyses, and write a
+ * JSON artifact plus a human-readable text report (also echoed to stdout).
+ * Progress/log messages go to stderr so stdout stays clean for the report.
+ */
+async function main() {
+  process.stderr.write(" Loading paragraphs...\n");
+  // readJsonl validates each line against the Paragraph schema; invalid lines are counted, not fatal.
+  const { records: paragraphs, skipped } = await readJsonl(PARAGRAPHS_PATH, Paragraph);
+  if (skipped > 0) process.stderr.write(` Warning: ${skipped} invalid lines skipped\n`);
+  process.stderr.write(` Loaded ${paragraphs.length} paragraphs\n\n`);
+
+  process.stderr.write(" Computing statistics...\n");
+
+  // Corpus-wide aggregates reused by several report sections.
+  const companyStats = analyzeByCompany(paragraphs);
+  const uniqueCompanies = new Set(paragraphs.map((p) => p.filing.cik)).size;
+  const uniqueFilings = new Set(paragraphs.map((p) => p.filing.accessionNumber)).size;
+  const totalWords = paragraphs.reduce((sum, p) => sum + p.wordCount, 0);
+
+  // Shape consumed by generateReport and serialized verbatim to JSON.
+  const analysis = {
+    generatedAt: new Date().toISOString(),
+    dataPath: PARAGRAPHS_PATH,
+    overview: {
+      totalParagraphs: paragraphs.length,
+      totalFilings: uniqueFilings,
+      uniqueCompanies,
+      totalWords,
+      avgWordsPerParagraph: round(totalWords / paragraphs.length),
+      avgParagraphsPerFiling: round(paragraphs.length / uniqueFilings),
+    },
+    filingTypeBreakdown: analyzeFilingTypeBreakdown(paragraphs),
+    wordCountDistribution: analyzeWordCountDistribution(paragraphs),
+    companySizeDistribution: analyzeCompanySizeDistribution(companyStats),
+    temporalDistribution: analyzeTemporalDistribution(paragraphs),
+    paragraphPosition: analyzeParagraphPosition(paragraphs),
+    textLengthByPosition: analyzeTextLengthByPosition(paragraphs),
+    contentSignals: analyzeContentSignals(paragraphs),
+    outliers: detectOutliers(companyStats),
+    tickerCoverage: analyzeTickerCoverage(paragraphs),
+    secItemDistribution: analyzeSecItemDistribution(paragraphs),
+    companyStats,
+  };
+
+  // Save JSON (machine-readable artifact)
+  const jsonPath = `${DATA}/analysis/corpus-stats.json`;
+  const jsonDir = dirname(jsonPath);
+  await mkdir(jsonDir, { recursive: true });
+  await writeFile(jsonPath, JSON.stringify(analysis, null, 2));
+  process.stderr.write(` Saved JSON: ${jsonPath}\n`);
+
+  // Save report (human-readable artifact)
+  const report = generateReport(analysis);
+  const reportPath = `${DATA}/analysis/corpus-stats.txt`;
+  await writeFile(reportPath, report);
+  process.stderr.write(` Saved report: ${reportPath}\n`);
+
+  // Print report to stdout
+  console.log(report);
+}
+
+// Script entry: surface any unhandled failure and exit non-zero.
+main().catch((fatalError) => {
+  console.error(fatalError);
+  process.exit(1);
+});
diff --git a/ts/src/analyze/data-quality.ts b/ts/src/analyze/data-quality.ts
new file mode 100644
index 0000000..cba193a
--- /dev/null
+++ b/ts/src/analyze/data-quality.ts
@@ -0,0 +1,450 @@
+/**
+ * Comprehensive data quality analysis and cleaning.
+ * Produces a cleaned dataset with dedup metadata, quality flags, and audit trail.
+ *
+ * Outputs:
+ * - data/paragraphs/paragraphs-clean.jsonl (cleaned dataset with dedup metadata)
+ * - data/analysis/quality-report.txt (human-readable audit report)
+ * - data/analysis/quality-report.json (machine-readable)
+ */
+import { readJsonl, writeJsonl } from "../lib/jsonl.ts";
+import { Paragraph } from "../schemas/paragraph.ts";
+import type { Paragraph as ParagraphType } from "../schemas/paragraph.ts";
+import { writeFile, mkdir } from "node:fs/promises";
+import { dirname } from "node:path";
+
+// Input corpus and cleaned-output locations, relative to the process working directory.
+const DATA = "../data";
+const INPUT_PATH = `${DATA}/paragraphs/paragraphs.jsonl`;
+const CLEAN_PATH = `${DATA}/paragraphs/paragraphs-clean.jsonl`;
+
+/** Round `n` to `d` decimal places (default 2). */
+function round(n: number, d = 2): number {
+  const factor = 10 ** d;
+  return Math.round(n * factor) / factor;
+}
+
+// ─── Quality checks ───
+
+/** A single quality finding attached to a paragraph or a filing. */
+interface QualityFlag {
+  flag: string; // machine-readable id, e.g. "too_short", "off_topic"
+  severity: "error" | "warning" | "info";
+  description: string; // human-readable explanation including the triggering values
+}
+
+/**
+ * Run per-paragraph quality heuristics and return every triggered flag.
+ * Pure regex/arithmetic checks on the paragraph text and word count;
+ * an empty array means no issues were detected.
+ */
+function checkParagraphQuality(p: ParagraphType): QualityFlag[] {
+  const flags: QualityFlag[] = [];
+
+  // Word count sanity
+  if (p.wordCount < 20) {
+    flags.push({ flag: "too_short", severity: "error", description: `Only ${p.wordCount} words` });
+  }
+  if (p.wordCount > 450) {
+    flags.push({ flag: "very_long", severity: "warning", description: `${p.wordCount} words — may be a parsing artifact` });
+  }
+
+  // Content quality heuristics
+  const text = p.text.toLowerCase();
+
+  // Table data (lots of numbers, pipe chars, or very short repeated patterns)
+  // NOTE(review): ratio is NaN for empty text, but such paragraphs already hit too_short.
+  const numberRatio = (text.match(/\d/g)?.length ?? 0) / text.length;
+  if (numberRatio > 0.3) {
+    flags.push({ flag: "high_numeric", severity: "warning", description: `${round(numberRatio * 100)}% numeric content — likely table/financial data` });
+  }
+
+  // Very short sentences (typical of bullet-point formatting artifacts)
+  // Sentence count falls back to 1 when no terminator [.!?] is present.
+  const avgWordPerSentence = p.wordCount / Math.max(1, (text.match(/[.!?]/g)?.length ?? 1));
+  if (avgWordPerSentence < 8 && p.wordCount > 30) {
+    flags.push({ flag: "fragmented", severity: "info", description: `Avg ${round(avgWordPerSentence)} words/sentence — may be bullet list` });
+  }
+
+  // Not cybersecurity-related content (leaked from other sections)
+  const cyberKeywords = /cybersecurity|cyber|information security|data breach|incident|threat|vulnerability|CISO|penetration test|access control|encryption|firewall|malware|phishing|ransomware/i;
+  if (!cyberKeywords.test(p.text)) {
+    // Check for governance/risk terms that are still relevant
+    const governanceKeywords = /board|committee|officer|risk management|compliance|audit|oversight|govern/i;
+    if (!governanceKeywords.test(p.text)) {
+      flags.push({ flag: "off_topic", severity: "warning", description: "No cybersecurity or governance keywords — may be from wrong section" });
+    }
+  }
+
+  // Legal boilerplate detection (informational — legitimate disclosures contain this too)
+  const legalBoilerplate = /no\s+assurance|cannot\s+guarantee|forward[- ]looking\s+statements?|safe\s+harbor|cautionary\s+statement/i;
+  if (legalBoilerplate.test(p.text)) {
+    flags.push({ flag: "legal_boilerplate", severity: "info", description: "Contains forward-looking/legal disclaimer language" });
+  }
+
+  // Page header/footer artifacts
+  if (/^(table of contents|page \d|^\d+$)/i.test(p.text.trim())) {
+    flags.push({ flag: "page_artifact", severity: "error", description: "Appears to be a page header/footer" });
+  }
+
+  return flags;
+}
+
+// ─── Dedup metadata ───
+
+/** Duplicate-text metadata computed per paragraph instance (grouped by exact textHash). */
+interface DedupInfo {
+  textHash: string; // hash of the paragraph text (grouping key)
+  duplicateCount: number; // total instances of this hash
+  uniqueCompanies: number; // how many different companies have this text
+  uniqueFilings: number; // how many different filings
+  fiscalYears: number[]; // which fiscal years
+  dedupCategory: "unique" | "cross_company_boilerplate" | "cross_year_copy" | "within_filing_dup";
+  isCanonical: boolean; // is this the "representative" instance for its hash group?
+}
+
+/**
+ * Group paragraphs by exact text hash and classify each hash group:
+ *   "unique"                    — a single instance
+ *   "cross_company_boilerplate" — identical text at more than one company
+ *   "cross_year_copy"           — one company reusing text across fiscal years
+ *   "within_filing_dup"         — repeated within a single company/year
+ * Returns one DedupInfo per instance, keyed by textHash; the first occurrence
+ * in each group is marked canonical.
+ * Fixes: the return type and both Maps were missing their type arguments
+ * (invalid TypeScript), and fiscalYears used the default lexicographic sort —
+ * now a numeric comparator.
+ */
+function computeDedupInfo(paragraphs: ParagraphType[]): Map<string, DedupInfo[]> {
+  // Group by textHash
+  const groups = new Map<string, ParagraphType[]>();
+  for (const p of paragraphs) {
+    const existing = groups.get(p.textHash) ?? [];
+    existing.push(p);
+    groups.set(p.textHash, existing);
+  }
+
+  // For each paragraph, compute its dedup info
+  const dedupMap = new Map<string, DedupInfo[]>();
+
+  for (const [hash, paras] of groups) {
+    const companies = new Set(paras.map((p) => p.filing.cik));
+    const filings = new Set(paras.map((p) => p.filing.accessionNumber));
+    const fiscalYears = [...new Set(paras.map((p) => p.filing.fiscalYear))].sort((a, b) => a - b);
+
+    let category: DedupInfo["dedupCategory"];
+    if (paras.length === 1) {
+      category = "unique";
+    } else if (companies.size > 1) {
+      category = "cross_company_boilerplate";
+    } else if (fiscalYears.length > 1) {
+      category = "cross_year_copy";
+    } else {
+      category = "within_filing_dup";
+    }
+
+    // First occurrence is canonical
+    const infos: DedupInfo[] = paras.map((_, idx) => ({
+      textHash: hash,
+      duplicateCount: paras.length,
+      uniqueCompanies: companies.size,
+      uniqueFilings: filings.size,
+      fiscalYears,
+      dedupCategory: category,
+      isCanonical: idx === 0,
+    }));
+
+    dedupMap.set(hash, infos);
+  }
+
+  return dedupMap;
+}
+
+// ─── Filing-level quality ───
+
+/** Filing-level quality summary: size totals plus any filing-scoped flags. */
+interface FilingQuality {
+  accessionNumber: string; // SEC accession number identifying the filing
+  companyName: string;
+  paragraphCount: number;
+  totalWords: number;
+  flags: QualityFlag[]; // filing-level findings (distinct from per-paragraph flags)
+}
+
+/**
+ * Compute filing-level quality flags (over/under-extraction, minimal
+ * disclosure, off-topic content drift), keyed by accession number.
+ * Fix: both Maps were missing their type arguments; restored.
+ */
+function analyzeFilingQuality(paragraphs: ParagraphType[]): Map<string, FilingQuality> {
+  // Group paragraphs by filing accession number.
+  const byFiling = new Map<string, ParagraphType[]>();
+  for (const p of paragraphs) {
+    const key = p.filing.accessionNumber;
+    const existing = byFiling.get(key) ?? [];
+    existing.push(p);
+    byFiling.set(key, existing);
+  }
+
+  const results = new Map<string, FilingQuality>();
+  for (const [accession, paras] of byFiling) {
+    const flags: QualityFlag[] = [];
+    const totalWords = paras.reduce((sum, p) => sum + p.wordCount, 0);
+
+    if (paras.length > 50) {
+      flags.push({ flag: "excessive_paragraphs", severity: "error", description: `${paras.length} paragraphs — likely parser over-extraction` });
+    } else if (paras.length > 35) {
+      flags.push({ flag: "high_paragraph_count", severity: "warning", description: `${paras.length} paragraphs — unusually long section` });
+    }
+
+    if (paras.length === 1 && totalWords < 50) {
+      flags.push({ flag: "minimal_disclosure", severity: "info", description: "Single short paragraph — minimal compliance" });
+    }
+
+    // Check for off-topic drift (>30% of paragraphs flagged off-topic)
+    const offTopicCount = paras.filter((p) => {
+      const pFlags = checkParagraphQuality(p);
+      return pFlags.some((f) => f.flag === "off_topic");
+    }).length;
+    if (offTopicCount > paras.length * 0.3 && paras.length > 5) {
+      flags.push({ flag: "content_drift", severity: "warning", description: `${offTopicCount}/${paras.length} paragraphs appear off-topic` });
+    }
+
+    results.set(accession, {
+      accessionNumber: accession,
+      companyName: paras[0]!.filing.companyName,
+      paragraphCount: paras.length,
+      totalWords,
+      flags,
+    });
+  }
+
+  return results;
+}
+
+// ─── Report generation ───
+
+/**
+ * Render the human-readable data-quality report as a single string.
+ *
+ * @param paragraphs     All input paragraphs (pre-cleaning).
+ * @param dedupMap       textHash -> per-copy dedup info (category, canonical flag, company spread).
+ * @param filingQuality  accession -> filing-level quality summary.
+ * @param paragraphFlags paragraph id -> quality flags raised for that paragraph.
+ * @param cleanCount     Number of paragraphs that survived cleaning.
+ * @param removedCount   Number of paragraphs dropped during cleaning.
+ * @returns The full report, newline-joined.
+ */
+function generateReport(
+  paragraphs: ParagraphType[],
+  dedupMap: Map,
+  filingQuality: Map,
+  paragraphFlags: Map,
+  cleanCount: number,
+  removedCount: number,
+): string {
+  const lines: string[] = [];
+  const hr = "═".repeat(72);
+  const sr = "─".repeat(72);
+
+  // Guard the percentage denominator so an empty input yields 0% instead of NaN%.
+  const denom = paragraphs.length > 0 ? paragraphs.length : 1;
+
+  lines.push(hr);
+  lines.push(" SEC-cyBERT Data Quality Report");
+  lines.push(` Generated: ${new Date().toISOString()}`);
+  lines.push(hr);
+  lines.push("");
+
+  // 1. Overview
+  lines.push("1. DATASET OVERVIEW");
+  lines.push(sr);
+  lines.push(` Input paragraphs: ${paragraphs.length.toLocaleString()}`);
+  lines.push(` Clean paragraphs: ${cleanCount.toLocaleString()}`);
+  lines.push(` Removed: ${removedCount.toLocaleString()} (${round((removedCount / denom) * 100)}%)`);
+  lines.push(` Total filings: ${filingQuality.size.toLocaleString()}`);
+  lines.push("");
+
+  // 2. Paragraph-level quality flags, most frequent first
+  const allFlags = new Map();
+  for (const [, flags] of paragraphFlags) {
+    for (const f of flags) {
+      allFlags.set(f.flag, (allFlags.get(f.flag) ?? 0) + 1);
+    }
+  }
+
+  lines.push("2. PARAGRAPH QUALITY FLAGS");
+  lines.push(sr);
+  for (const [flag, count] of [...allFlags.entries()].sort((a, b) => b[1] - a[1])) {
+    const pct = round((count / denom) * 100);
+    lines.push(` ${flag.padEnd(25)} ${String(count).padStart(7)} (${String(pct).padStart(5)}%)`);
+  }
+  lines.push("");
+
+  // 3. Filing-level quality flags, most frequent first
+  const filingFlags = new Map();
+  for (const [, fq] of filingQuality) {
+    for (const f of fq.flags) {
+      filingFlags.set(f.flag, (filingFlags.get(f.flag) ?? 0) + 1);
+    }
+  }
+
+  lines.push("3. FILING QUALITY FLAGS");
+  lines.push(sr);
+  for (const [flag, count] of [...filingFlags.entries()].sort((a, b) => b[1] - a[1])) {
+    lines.push(` ${flag.padEnd(30)} ${count} filings`);
+  }
+  lines.push("");
+
+  // 4. Dedup categories — totals over every copy, plus canonical-only counts
+  const dedupCounts = { unique: 0, cross_company_boilerplate: 0, cross_year_copy: 0, within_filing_dup: 0 };
+  const canonicalCounts = { ...dedupCounts };
+  for (const [, infos] of dedupMap) {
+    for (const info of infos) {
+      dedupCounts[info.dedupCategory]++;
+      if (info.isCanonical) canonicalCounts[info.dedupCategory]++;
+    }
+  }
+
+  lines.push("4. DEDUPLICATION CATEGORIES (all paragraphs)");
+  lines.push(sr);
+  for (const [cat, count] of Object.entries(dedupCounts)) {
+    const pct = round((count / denom) * 100);
+    const canonical = canonicalCounts[cat as keyof typeof canonicalCounts];
+    lines.push(` ${cat.padEnd(30)} ${String(count).padStart(7)} total (${String(pct).padStart(5)}%) │ ${canonical} canonical`);
+  }
+  lines.push("");
+
+  // 5. Boilerplate templates (top cross-company).
+  // Index the first paragraph per hash once, instead of running an O(n)
+  // paragraphs.find() inside the dedupMap loop (was accidentally O(n·m)).
+  const paragraphByHash = new Map();
+  for (const p of paragraphs) {
+    if (!paragraphByHash.has(p.textHash)) paragraphByHash.set(p.textHash, p);
+  }
+
+  const boilerplateGroups: Array<{ hash: string; count: number; companies: number; sample: string }> = [];
+  for (const [hash, infos] of dedupMap) {
+    if (infos.length > 0 && infos[0]!.dedupCategory === "cross_company_boilerplate") {
+      const samplePara = paragraphByHash.get(hash);
+      boilerplateGroups.push({
+        hash,
+        count: infos.length,
+        companies: infos[0]!.uniqueCompanies,
+        sample: samplePara?.text.slice(0, 150) ?? "",
+      });
+    }
+  }
+  boilerplateGroups.sort((a, b) => b.count - a.count);
+
+  lines.push("5. TOP 20 BOILERPLATE TEMPLATES (cross-company identical text)");
+  lines.push(sr);
+  for (const bg of boilerplateGroups.slice(0, 20)) {
+    lines.push(` [${bg.count} copies, ${bg.companies} companies]`);
+    lines.push(` "${bg.sample}..."`);
+    lines.push("");
+  }
+
+  // 6. Worst filings: most error-severity flags first, paragraph count as tiebreak
+  const problemFilings = [...filingQuality.values()]
+    .filter((fq) => fq.flags.length > 0)
+    .sort((a, b) => {
+      const aErrors = a.flags.filter((f) => f.severity === "error").length;
+      const bErrors = b.flags.filter((f) => f.severity === "error").length;
+      return bErrors - aErrors || b.paragraphCount - a.paragraphCount;
+    });
+
+  lines.push("6. FILINGS WITH QUALITY ISSUES (top 20)");
+  lines.push(sr);
+  for (const fq of problemFilings.slice(0, 20)) {
+    lines.push(` ${fq.companyName.slice(0, 40).padEnd(42)} ${fq.paragraphCount} paras, ${fq.totalWords} words`);
+    for (const f of fq.flags) {
+      lines.push(` [${f.severity}] ${f.description}`);
+    }
+    lines.push("");
+  }
+
+  // 7. Clean dataset summary for labeling: within-filing dups are the only
+  // category excluded from the "unique canonical" estimate.
+  lines.push("7. CLEAN DATASET SUMMARY");
+  lines.push(sr);
+  lines.push(` Total paragraphs for labeling: ${cleanCount.toLocaleString()}`);
+  const uniqueInClean = canonicalCounts.unique +
+    canonicalCounts.cross_company_boilerplate +
+    canonicalCounts.cross_year_copy;
+  lines.push(` Unique texts (canonical only): ~${uniqueInClean.toLocaleString()}`);
+  lines.push(` Labeling strategy: label canonical paragraphs, propagate to duplicates`);
+  lines.push("");
+
+  lines.push(hr);
+  lines.push(" END OF QUALITY REPORT");
+  lines.push(hr);
+
+  return lines.join("\n");
+}
+
+// ─── Main ───
+
+async function main() {
+ process.stderr.write(" Loading paragraphs...\n");
+ const { records: paragraphs, skipped } = await readJsonl(INPUT_PATH, Paragraph);
+ if (skipped > 0) process.stderr.write(` Warning: ${skipped} invalid lines skipped\n`);
+ process.stderr.write(` Loaded ${paragraphs.length} paragraphs\n\n`);
+
+ // Step 1: Compute dedup info
+ process.stderr.write(" Computing dedup metadata...\n");
+ const dedupMap = computeDedupInfo(paragraphs);
+
+ // Step 2: Check paragraph quality
+ process.stderr.write(" Checking paragraph quality...\n");
+ const paragraphFlags = new Map();
+ for (const p of paragraphs) {
+ paragraphFlags.set(p.id, checkParagraphQuality(p));
+ }
+
+ // Step 3: Check filing quality
+ process.stderr.write(" Checking filing quality...\n");
+ const filingQuality = analyzeFilingQuality(paragraphs);
+
+ // Step 4: Build clean dataset
+ // Remove: page artifacts, within-filing duplicates (keep canonical only)
+ process.stderr.write(" Building clean dataset...\n");
+
+ // Build a per-paragraph dedup info lookup
+ const hashCounters = new Map();
+ const dedupInfoByParagraph = new Map();
+ for (const p of paragraphs) {
+ const counter = hashCounters.get(p.textHash) ?? 0;
+ hashCounters.set(p.textHash, counter + 1);
+
+ const infos = dedupMap.get(p.textHash);
+ if (infos && counter < infos.length) {
+ dedupInfoByParagraph.set(p.id, infos[counter]!);
+ }
+ }
+
+ const cleanParagraphs: ParagraphType[] = [];
+ let removed = 0;
+
+ for (const p of paragraphs) {
+ const flags = paragraphFlags.get(p.id) ?? [];
+ const hasError = flags.some((f) => f.severity === "error");
+
+ if (hasError) {
+ removed++;
+ continue;
+ }
+
+ cleanParagraphs.push(p);
+ }
+
+ // Write clean dataset
+ await writeJsonl(CLEAN_PATH, cleanParagraphs);
+ process.stderr.write(` Written ${cleanParagraphs.length} clean paragraphs to ${CLEAN_PATH}\n`);
+
+ // Step 5: Generate reports
+ const report = generateReport(paragraphs, dedupMap, filingQuality, paragraphFlags, cleanParagraphs.length, removed);
+
+ const reportPath = `${DATA}/analysis/quality-report.txt`;
+ const jsonPath = `${DATA}/analysis/quality-report.json`;
+ await mkdir(dirname(reportPath), { recursive: true });
+ await writeFile(reportPath, report);
+
+ // JSON report
+ const jsonReport = {
+ generatedAt: new Date().toISOString(),
+ input: { paragraphs: paragraphs.length, filings: filingQuality.size },
+ clean: { paragraphs: cleanParagraphs.length, removed },
+ dedupCategories: {
+ unique: { total: 0, canonical: 0 },
+ cross_company_boilerplate: { total: 0, canonical: 0 },
+ cross_year_copy: { total: 0, canonical: 0 },
+ within_filing_dup: { total: 0, canonical: 0 },
+ } as Record,
+ paragraphFlags: Object.fromEntries([...new Map()].map(([k, v]) => [k, v])),
+ filingFlags: Object.fromEntries([...new Map()].map(([k, v]) => [k, v])),
+ };
+
+ // Populate dedup counts
+ for (const [, infos] of dedupMap) {
+ for (const info of infos) {
+ jsonReport.dedupCategories[info.dedupCategory]!.total++;
+ if (info.isCanonical) jsonReport.dedupCategories[info.dedupCategory]!.canonical++;
+ }
+ }
+
+ // Populate flag counts
+ const flagCounts = new Map();
+ for (const [, flags] of paragraphFlags) {
+ for (const f of flags) {
+ flagCounts.set(f.flag, (flagCounts.get(f.flag) ?? 0) + 1);
+ }
+ }
+ jsonReport.paragraphFlags = Object.fromEntries(flagCounts);
+
+ const filingFlagCounts = new Map();
+ for (const [, fq] of filingQuality) {
+ for (const f of fq.flags) {
+ filingFlagCounts.set(f.flag, (filingFlagCounts.get(f.flag) ?? 0) + 1);
+ }
+ }
+ jsonReport.filingFlags = Object.fromEntries(filingFlagCounts);
+
+ await writeFile(jsonPath, JSON.stringify(jsonReport, null, 2));
+
+ process.stderr.write(` Saved: ${reportPath}\n`);
+ process.stderr.write(` Saved: ${jsonPath}\n`);
+ console.log(report);
+}
+
+// Script entry: run the pipeline and exit non-zero on any unhandled failure.
+main().catch((error) => {
+  console.error(error);
+  process.exit(1);
+});
diff --git a/ts/src/analyze/debug-parser.ts b/ts/src/analyze/debug-parser.ts
new file mode 100644
index 0000000..0930ddf
--- /dev/null
+++ b/ts/src/analyze/debug-parser.ts
@@ -0,0 +1,71 @@
+import { readFileSync } from "node:fs";
+
+// Accession number to inspect; the first CLI argument overrides the default.
+const acc = process.argv[2] ?? "0001493152-24-018935"; // MARIZYME default
+// Raw filing HTML read from the local data directory for this accession
+// (presumably written there by the extract pipeline — confirm).
+const html = readFileSync(`../data/raw/html/${acc}.html`, "utf-8");
+
+const stripped = html
+ .replace(/