SEC-cyBERT/ts/scripts/sample-disputes.ts
2026-03-28 23:44:37 -04:00

230 lines
9.1 KiB
TypeScript

/**
* Sample and print full paragraph text for the hardest dispute types.
*
* Prints up to 5 paragraphs from each of 4 dispute categories (at most 20 total),
* with all 3 model annotations and company metadata.
*
* Usage: bun ts/scripts/sample-disputes.ts
*/
import { fileURLToPath } from "node:url";
import { Paragraph } from "@sec-cybert/schemas/paragraph.ts";
import { readJsonl, readJsonlRaw } from "../src/lib/jsonl.ts";
// Input files, resolved relative to this script rather than the working directory.
// `fileURLToPath` is used instead of `URL.pathname` because `pathname` keeps
// percent-encoding (a space in the checkout path becomes `%20`) and is not a valid
// filesystem path on Windows.
const PARAGRAPHS = fileURLToPath(new URL("../../data/paragraphs/paragraphs-clean.jsonl", import.meta.url));
const ANNOTATIONS = fileURLToPath(new URL("../../data/annotations/stage1.jsonl", import.meta.url));
// ── Types ──────────────────────────────────────────────────────────────
/** Label fields of an annotation that this script reads and prints. */
interface AnnLabel {
  content_category: string;
  specificity_level: number;
  category_confidence: string;
  specificity_confidence: string;
  reasoning: string;
}

/** Origin of an annotation; only the model identifier is used here. */
interface AnnProvenance {
  modelId: string;
}

/**
 * One record of the stage-1 annotations file, restricted to the fields this
 * script touches. Records are grouped by `paragraphId`.
 */
interface Ann {
  paragraphId: string;
  label: AnnLabel;
  provenance: AnnProvenance;
}
// Type-level counterpart of the imported `Paragraph` value: the type of its `_output`
// property. Reusing the name is legal because types and values live in separate namespaces.
// NOTE(review): `_output` looks like a schema library's inferred-output marker — confirm
// against the schemas package before relying on it.
type Paragraph = (typeof Paragraph)["_output"];
// ── Helpers ────────────────────────────────────────────────────────────
/**
 * Finds the most frequent value in `arr`, provided it occurs at least twice.
 *
 * Ties are broken in favour of the value that appears first in `arr`.
 * Values are compared with `Map` key semantics, so objects are compared by identity.
 *
 * @param arr Values to tally.
 * @returns The winning value and its occurrence count, or `null` when no value
 *          occurs two or more times (including when `arr` is empty).
 */
function majority<T>(arr: T[]): { value: T; count: number } | null {
  const counts = new Map<T, number>();
  for (const v of arr) counts.set(v, (counts.get(v) ?? 0) + 1);

  // Track the winning entry itself rather than a `null` sentinel value, so that a
  // genuine majority of `null` elements is not mistaken for "no winner".
  let best: { value: T; count: number } | null = null;
  for (const [value, count] of counts) {
    if (best === null || count > best.count) best = { value, count };
  }
  return best !== null && best.count >= 2 ? best : null;
}
/**
 * Picks `count` candidates reproducibly, favouring distinct companies.
 *
 * Pools no larger than `count` are returned as-is (same array, original order).
 * Otherwise the pool is ordered by ID and walked at a fixed stride: the first walk
 * accepts only candidates whose company has not been picked yet, and a second walk
 * over the same positions tops up the selection if too few companies were found.
 */
function deterministicSample(
  candidates: { id: string; companyName: string }[],
  count: number,
): typeof candidates {
  if (candidates.length <= count) {
    return candidates;
  }

  // Ordering by ID makes the result independent of the input order.
  const ordered = candidates.slice().sort((left, right) => left.id.localeCompare(right.id));

  // Stride sized so that roughly 3x `count` positions are visited, leaving room to
  // skip repeated companies; never smaller than one position.
  const stride = Math.max(1, Math.floor(ordered.length / (count * 3)));

  const chosen: typeof candidates = [];
  const companiesTaken = new Set<string>();

  // First walk: one candidate per company.
  let cursor = 0;
  while (cursor < ordered.length && chosen.length < count) {
    const candidate = ordered[cursor];
    cursor += stride;
    if (companiesTaken.has(candidate.companyName)) {
      continue;
    }
    companiesTaken.add(candidate.companyName);
    chosen.push(candidate);
  }

  // Second walk: revisit the same positions and accept anything not yet chosen.
  for (let pos = 0; chosen.length < count && pos < ordered.length; pos += stride) {
    const candidate = ordered[pos];
    if (chosen.includes(candidate)) {
      continue;
    }
    chosen.push(candidate);
  }

  return chosen.slice(0, count);
}
/**
 * Shortens a model identifier for display: keeps only the part after the last `/`,
 * drops a trailing `-preview`, and caps the result at 30 characters.
 *
 * Example: "google/gemini-3.1-flash-lite-preview" becomes "gemini-3.1-flash-lite".
 */
function shortModel(modelId: string): string {
  const segments = modelId.split("/");
  const lastSegment = segments[segments.length - 1] ?? modelId;
  const withoutSuffix = lastSegment.replace(/-preview$/, "");
  return withoutSuffix.length > 30 ? withoutSuffix.substring(0, 30) : withoutSuffix;
}
/**
 * Writes one sampled paragraph to stdout: a header with filing metadata, the full
 * paragraph text, and then one boxed entry per annotation showing the model name,
 * category, specificity (each with its confidence) and reasoning.
 *
 * @param para  Paragraph to print.
 * @param anns  Annotations for this paragraph, printed in the order given.
 * @param index Running number shown in the header.
 */
function printSample(
  para: Paragraph,
  anns: Ann[],
  index: number,
) {
  const rule = "─".repeat(80);
  const filing = para.filing;

  // Header
  console.log(`\n${rule}`);
  console.log(` [${index}] ${filing.companyName} (${filing.ticker}) — ${filing.filingType} ${filing.filingDate}`);
  // paragraphIndex is shown one-based.
  console.log(` Paragraph ${para.paragraphIndex + 1}, ${para.wordCount} words, ID: ${para.id}`);
  console.log(`${rule}`);

  // Body, set off by blank lines
  console.log();
  console.log(para.text);
  console.log();

  // One box per annotation
  for (const { label, provenance } of anns) {
    console.log(` ┌─ ${shortModel(provenance.modelId)}`);
    console.log(` │ Category: ${label.content_category} (${label.category_confidence})`);
    console.log(` │ Specificity: ${label.specificity_level} (${label.specificity_confidence})`);
    console.log(` │ Reasoning: ${label.reasoning}`);
    console.log(` └─`);
  }
}
// ── Main ───────────────────────────────────────────────────────────────
/**
 * Script entry point.
 *
 * Loads the paragraphs and annotations files, groups annotations by paragraph, sorts
 * every paragraph that has at least three annotations into zero or more of four
 * dispute pools, and prints a deterministic sample of up to five paragraphs per pool
 * along with all of their annotations.
 */
async function main() {
  type Candidate = { id: string; companyName: string };

  console.log("Loading paragraphs...");
  const { records: paragraphs } = await readJsonl(PARAGRAPHS, Paragraph);
  console.log(` ${paragraphs.length.toLocaleString()} paragraphs`);
  const paraById = new Map(paragraphs.map(p => [p.id, p]));

  console.log("Loading annotations...");
  const { records: rawAnns, skipped } = await readJsonlRaw(ANNOTATIONS);
  // NOTE(review): unchecked cast — raw records are trusted to match `Ann`.
  const anns = rawAnns as Ann[];
  console.log(` ${anns.length.toLocaleString()} annotations (${skipped} skipped)`);

  // Group annotations by paragraph
  const byParagraph = new Map<string, Ann[]>();
  for (const a of anns) {
    let arr = byParagraph.get(a.paragraphId);
    if (!arr) { arr = []; byParagraph.set(a.paragraphId, arr); }
    arr.push(a);
  }

  // Count paragraphs per company (used to detect "small" companies for pool 4)
  const companyParaCount = new Map<string, number>();
  for (const p of paragraphs) {
    const name = p.filing.companyName;
    companyParaCount.set(name, (companyParaCount.get(name) ?? 0) + 1);
  }

  // ── Build candidate pools ────────────────────────────────────────────
  // Pool 1: Spec [3,4] disputes — specs are [3,3,4] or [3,4,4]
  const specDisputes: Candidate[] = [];
  // Pool 2: Management ↔ RMP disputes — 2 say one, 1 says the other
  const mgmtRmpDisputes: Candidate[] = [];
  // Pool 3: None/Other ↔ Strategy disputes
  const noneStrategyDisputes: Candidate[] = [];
  // Pool 4: Small company (1-3 paras) + unresolved (3-way cat split)
  const smallUnresolved: Candidate[] = [];

  // True when two tallies form a 2-vs-1 split in either direction.
  const isTwoOneSplit = (a: number, b: number): boolean =>
    (a === 2 && b === 1) || (a === 1 && b === 2);

  for (const [pid, pannAnns] of byParagraph) {
    // Paragraphs with fewer than three annotations are not considered.
    if (pannAnns.length < 3) continue;
    // Annotations whose paragraph is absent from the paragraphs file are ignored.
    const para = paraById.get(pid);
    if (!para) continue;

    const cats = pannAnns.map(a => a.label.content_category);
    const specs = pannAnns.map(a => a.label.specificity_level);
    const sortedSpecs = [...specs].sort((a, b) => a - b);
    const companyName = para.filing.companyName;
    const countOf = (category: string): number => cats.filter(c => c === category).length;

    // Pool 1: Spec [3,3,4] or [3,4,4]
    const specKey = sortedSpecs.join(",");
    if (specKey === "3,3,4" || specKey === "3,4,4") {
      specDisputes.push({ id: pid, companyName });
    }

    // Pool 2: Management ↔ RMP (2-1 split in either direction)
    if (isTwoOneSplit(countOf("Management Role"), countOf("Risk Management Process"))) {
      mgmtRmpDisputes.push({ id: pid, companyName });
    }

    // Pool 3: None/Other ↔ Strategy Integration (2-1 split in either direction)
    if (isTwoOneSplit(countOf("None/Other"), countOf("Strategy Integration"))) {
      noneStrategyDisputes.push({ id: pid, companyName });
    }

    // Pool 4: Small company (1-3 paras) with 3-way cat split (unresolved)
    const paraCount = companyParaCount.get(companyName) ?? 0;
    const uniqueCats = new Set(cats);
    if (paraCount <= 3 && uniqueCats.size === 3) {
      smallUnresolved.push({ id: pid, companyName });
    }
  }

  console.log(`\nCandidate pools:`);
  console.log(` Spec [3,4] disputes: ${specDisputes.length}`);
  console.log(` Management ↔ RMP disputes: ${mgmtRmpDisputes.length}`);
  console.log(` None/Other ↔ Strategy disputes: ${noneStrategyDisputes.length}`);
  console.log(` Small co. unresolved: ${smallUnresolved.length}`);

  // ── Sample and print ────────────────────────────────────────────────
  const sections: [string, Candidate[]][] = [
    ["SPEC [3,4] DISPUTES — Models can't decide firm-specific vs quantified-verifiable", specDisputes],
    ["MANAGEMENT ↔ RMP DISPUTES — 2-vs-1 split between Management Role and Risk Management Process", mgmtRmpDisputes],
    ["NONE/OTHER ↔ STRATEGY INTEGRATION DISPUTES — 2-vs-1 split between None/Other and Strategy Integration", noneStrategyDisputes],
    ["SMALL COMPANY (1-3 PARAS) UNRESOLVED — 3-way category split, tiny filings", smallUnresolved],
  ];

  // Running number across all sections, so printed samples are numbered 1..N.
  let globalIdx = 1;
  for (const [title, pool] of sections) {
    console.log(`\n${"═".repeat(80)}`);
    console.log(` ${title}`);
    console.log(`${"═".repeat(80)}`);

    const sampled = deterministicSample(pool, 5);
    if (sampled.length === 0) {
      console.log("\n (no candidates found)");
      continue;
    }

    for (const item of sampled) {
      // Both lookups succeed for every pooled ID (pools are built from these maps);
      // the guard only exists to narrow the types without non-null assertions.
      const para = paraById.get(item.id);
      const pannAnns = byParagraph.get(item.id);
      if (!para || !pannAnns) continue;
      printSample(para, pannAnns, globalIdx++);
    }
  }

  console.log(`\n${"═".repeat(80)}`);
  console.log(` Done — ${globalIdx - 1} paragraphs printed.`);
  console.log(`${"═".repeat(80)}`);
}
// Run the script. On failure, log the error and mark the process as failed so that
// shells and CI do not see a zero exit status for a crashed run.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});