/**
 * Sample and print full paragraph text for the hardest dispute types.
 *
 * Prints up to 5 paragraphs from each of 4 dispute categories (20 total at
 * most), with every model annotation recorded for the paragraph and the
 * company metadata of its filing.
 *
 * Usage: bun ts/scripts/sample-disputes.ts
 */
import { fileURLToPath } from "node:url";

import { readJsonl, readJsonlRaw } from "../src/lib/jsonl.ts";
import { Paragraph } from "@sec-cybert/schemas/paragraph.ts";

// `fileURLToPath` is used instead of `URL#pathname` because `pathname` stays
// percent-encoded (a space becomes "%20") and is not a valid path on Windows.
const PARAGRAPHS = fileURLToPath(
  new URL("../../data/paragraphs/paragraphs-clean.jsonl", import.meta.url),
);
const ANNOTATIONS = fileURLToPath(
  new URL("../../data/annotations/stage1.jsonl", import.meta.url),
);

// ── Types ──────────────────────────────────────────────────────────────

/** The fields of a stage-1 annotation record that this script reads. */
interface Ann {
  paragraphId: string;
  label: {
    content_category: string;
    specificity_level: number;
    category_confidence: string;
    specificity_confidence: string;
    reasoning: string;
  };
  provenance: {
    modelId: string;
  };
}

/** Type of the `_output` member of the imported `Paragraph` schema value. */
type Paragraph = (typeof Paragraph)["_output"];

/** Handle for a paragraph placed in a candidate pool. */
interface Candidate {
  id: string;
  companyName: string;
}

// ── Helpers ────────────────────────────────────────────────────────────

/**
 * Find the most frequent value in `arr`.
 *
 * @param arr Values to tally (compared with `Map` key equality).
 * @returns The most frequent value and its count, or `null` when no value
 *   occurs at least twice. On a tie the value that appeared first wins,
 *   because only a strictly larger count replaces the current best.
 */
function majority<T>(arr: T[]): { value: T; count: number } | null {
  const counts = new Map<T, number>();
  for (const v of arr) counts.set(v, (counts.get(v) ?? 0) + 1);

  let best: T | null = null;
  let bestCount = 0;
  for (const [v, c] of counts) {
    if (c > bestCount) {
      best = v;
      bestCount = c;
    }
  }
  return best !== null && bestCount >= 2 ? { value: best, count: bestCount } : null;
}

/**
 * Deterministic sample: sort by ID, then walk the sorted list at a fixed
 * stride, preferring one item per company.
 *
 * @param candidates Pool to sample from. Not mutated.
 * @param count Maximum number of items to return.
 * @returns `candidates` itself (original order) when it has at most `count`
 *   items; otherwise at most `count` items chosen from the ID-sorted pool.
 */
function deterministicSample(candidates: Candidate[], count: number): Candidate[] {
  if (candidates.length <= count) return candidates;

  // Sort a copy by ID so the result does not depend on input order.
  const sorted = [...candidates].sort((a, b) => a.id.localeCompare(b.id));

  // Visit roughly 3x `count` evenly spaced items so that some can be skipped
  // for company diversity; the stride never drops below 1.
  const stride = Math.max(1, Math.floor(sorted.length / (count * 3)));

  // Pass 1: take strided items, at most one per company.
  const picked: Candidate[] = [];
  const seenCompanies = new Set<string>();
  for (let i = 0; i < sorted.length && picked.length < count; i += stride) {
    const item = sorted[i];
    if (item === undefined || seenCompanies.has(item.companyName)) continue;
    picked.push(item);
    seenCompanies.add(item.companyName);
  }

  // Pass 2: if diversity left us short, revisit the same strided positions
  // and accept the items that pass 1 skipped.
  for (let i = 0; i < sorted.length && picked.length < count; i += stride) {
    const item = sorted[i];
    if (item !== undefined && !picked.includes(item)) picked.push(item);
  }

  return picked.slice(0, count);
}

/**
 * Shorten a model identifier for display.
 *
 * Keeps the segment after the last "/", drops a trailing "-preview", and
 * truncates to 30 characters.
 */
function shortModel(modelId: string): string {
  // "google/gemini-3.1-flash-lite-preview" → "gemini-3.1-flash-lite"
  const name = modelId.split("/").pop() ?? modelId;
  return name.replace(/-preview$/, "").slice(0, 30);
}

/**
 * Print one paragraph: a header with filing metadata, the paragraph text,
 * and one boxed entry per annotation.
 *
 * @param para Paragraph to print.
 * @param anns Annotations to list under the paragraph, in the given order.
 * @param index Running number shown in the header.
 */
function printSample(para: Paragraph, anns: Ann[], index: number) {
  const rule = "─".repeat(80);

  console.log(`\n${rule}`);
  console.log(` [${index}] ${para.filing.companyName} (${para.filing.ticker}) — ${para.filing.filingType} ${para.filing.filingDate}`);
  console.log(` Paragraph ${para.paragraphIndex + 1}, ${para.wordCount} words, ID: ${para.id}`);
  console.log(`${rule}`);
  console.log();
  console.log(para.text);
  console.log();

  for (const a of anns) {
    const model = shortModel(a.provenance.modelId);
    console.log(` ┌─ ${model}`);
    console.log(` │ Category: ${a.label.content_category} (${a.label.category_confidence})`);
    console.log(` │ Specificity: ${a.label.specificity_level} (${a.label.specificity_confidence})`);
    console.log(` │ Reasoning: ${a.label.reasoning}`);
    console.log(` └─`);
  }
}

/**
 * True when exactly two of `cats` equal one of the given categories and
 * exactly one equals the other (in either direction).
 */
function isTwoOneSplit(cats: readonly string[], first: string, second: string): boolean {
  const firstCount = cats.filter((c) => c === first).length;
  const secondCount = cats.filter((c) => c === second).length;
  return (firstCount === 2 && secondCount === 1) || (firstCount === 1 && secondCount === 2);
}

// ── Main ───────────────────────────────────────────────────────────────

/**
 * Load paragraphs and annotations, sort paragraphs with at least three
 * annotations into four dispute pools, then print up to 5 sampled
 * paragraphs per pool.
 */
async function main() {
  console.log("Loading paragraphs...");
  const { records: paragraphs } = await readJsonl(PARAGRAPHS, Paragraph);
  console.log(` ${paragraphs.length.toLocaleString()} paragraphs`);

  const paraById = new Map(paragraphs.map((p) => [p.id, p]));

  console.log("Loading annotations...");
  const { records: rawAnns, skipped } = await readJsonlRaw(ANNOTATIONS);
  // NOTE(review): raw records are cast, not validated, so a malformed record
  // would only surface when one of its fields is read.
  const anns = rawAnns as Ann[];
  console.log(` ${anns.length.toLocaleString()} annotations (${skipped} skipped)`);

  // Group annotations by paragraph
  const byParagraph = new Map<string, Ann[]>();
  for (const a of anns) {
    let group = byParagraph.get(a.paragraphId);
    if (!group) {
      group = [];
      byParagraph.set(a.paragraphId, group);
    }
    group.push(a);
  }

  // Count paragraphs per company
  const companyParaCount = new Map<string, number>();
  for (const p of paragraphs) {
    const name = p.filing.companyName;
    companyParaCount.set(name, (companyParaCount.get(name) ?? 0) + 1);
  }

  // ── Build candidate pools ────────────────────────────────────────────

  // Pool 1: Spec [3,4] disputes — specs are [3,3,4] or [3,4,4]
  const specDisputes: Candidate[] = [];
  // Pool 2: Management ↔ RMP disputes — 2 say one, 1 says the other
  const mgmtRmpDisputes: Candidate[] = [];
  // Pool 3: None/Other ↔ Strategy disputes
  const noneStrategyDisputes: Candidate[] = [];
  // Pool 4: Small company (1-3 paras) + unresolved (3-way cat split)
  const smallUnresolved: Candidate[] = [];

  for (const [pid, paraAnns] of byParagraph) {
    if (paraAnns.length < 3) continue;
    const para = paraById.get(pid);
    if (!para) continue;

    const cats = paraAnns.map((a) => a.label.content_category);
    const specs = paraAnns.map((a) => a.label.specificity_level);
    const companyName = para.filing.companyName;

    // A paragraph may land in more than one pool; the checks are independent.

    // Pool 1: Spec [3,3,4] or [3,4,4]
    const specKey = [...specs].sort((a, b) => a - b).join(",");
    if (specKey === "3,3,4" || specKey === "3,4,4") {
      specDisputes.push({ id: pid, companyName });
    }

    // Pool 2: Management ↔ RMP (2-1 split in either direction)
    if (isTwoOneSplit(cats, "Management Role", "Risk Management Process")) {
      mgmtRmpDisputes.push({ id: pid, companyName });
    }

    // Pool 3: None/Other ↔ Strategy Integration (2-1 split in either direction)
    if (isTwoOneSplit(cats, "None/Other", "Strategy Integration")) {
      noneStrategyDisputes.push({ id: pid, companyName });
    }

    // Pool 4: Small company (1-3 paras) with 3-way cat split (unresolved)
    const paraCount = companyParaCount.get(companyName) ?? 0;
    if (paraCount <= 3 && new Set(cats).size === 3) {
      smallUnresolved.push({ id: pid, companyName });
    }
  }

  console.log(`\nCandidate pools:`);
  console.log(` Spec [3,4] disputes: ${specDisputes.length}`);
  console.log(` Management ↔ RMP disputes: ${mgmtRmpDisputes.length}`);
  console.log(` None/Other ↔ Strategy disputes: ${noneStrategyDisputes.length}`);
  console.log(` Small co. unresolved: ${smallUnresolved.length}`);

  // ── Sample and print ────────────────────────────────────────────────

  const sections: [string, Candidate[]][] = [
    ["SPEC [3,4] DISPUTES — Models can't decide firm-specific vs quantified-verifiable", specDisputes],
    ["MANAGEMENT ↔ RMP DISPUTES — 2-vs-1 split between Management Role and Risk Management Process", mgmtRmpDisputes],
    ["NONE/OTHER ↔ STRATEGY INTEGRATION DISPUTES — 2-vs-1 split between None/Other and Strategy Integration", noneStrategyDisputes],
    ["SMALL COMPANY (1-3 PARAS) UNRESOLVED — 3-way category split, tiny filings", smallUnresolved],
  ];

  const banner = "═".repeat(80);
  let globalIdx = 1;

  for (const [title, pool] of sections) {
    console.log(`\n${banner}`);
    console.log(` ${title}`);
    console.log(`${banner}`);

    const sampled = deterministicSample(pool, 5);
    if (sampled.length === 0) {
      console.log("\n (no candidates found)");
      continue;
    }

    for (const item of sampled) {
      // Pools are built only from IDs present in both maps, so this guard
      // exists to narrow the types rather than to skip real data.
      const para = paraById.get(item.id);
      const paraAnns = byParagraph.get(item.id);
      if (!para || !paraAnns) continue;
      printSample(para, paraAnns, globalIdx++);
    }
  }

  console.log(`\n${banner}`);
  console.log(` Done — ${globalIdx - 1} paragraphs printed.`);
  console.log(`${banner}`);
}

main().catch((err: unknown) => {
  console.error(err);
  // Signal failure to the shell instead of exiting with status 0.
  process.exitCode = 1;
});