230 lines
9.1 KiB
TypeScript
230 lines
9.1 KiB
TypeScript
/**
 * Sample and print full paragraph text for the hardest dispute types.
 *
 * Prints up to 5 paragraphs from each of 4 dispute categories (up to 20 total),
 * with all 3 model annotations and company metadata.
 *
 * Usage: bun ts/scripts/sample-disputes.ts
 */
|
|
import { fileURLToPath } from "node:url";

import { Paragraph } from "@sec-cybert/schemas/paragraph.ts";
import { readJsonl, readJsonlRaw } from "../src/lib/jsonl.ts";
|
|
|
|
// Input files are resolved relative to this script. `fileURLToPath` is used
// instead of `URL.pathname` because `pathname` stays percent-encoded (a space
// becomes "%20") and is not a valid filesystem path on every platform.

/** Filesystem path of the cleaned paragraphs JSONL file. */
const PARAGRAPHS = fileURLToPath(new URL("../../data/paragraphs/paragraphs-clean.jsonl", import.meta.url));

/** Filesystem path of the stage-1 annotations JSONL file. */
const ANNOTATIONS = fileURLToPath(new URL("../../data/annotations/stage1.jsonl", import.meta.url));
|
|
|
|
// ── Types ──────────────────────────────────────────────────────────────
|
|
/** The label fields attached to a single annotation. */
interface AnnLabel {
  /** Category name, e.g. "Management Role" or "None/Other". */
  content_category: string;
  /** Numeric specificity level. */
  specificity_level: number;
  /** Confidence string reported for the category. */
  category_confidence: string;
  /** Confidence string reported for the specificity level. */
  specificity_confidence: string;
  /** Free-text reasoning printed alongside the label. */
  reasoning: string;
}

/** Origin of an annotation. */
interface AnnProvenance {
  /** Identifier of the model that produced the annotation. */
  modelId: string;
}

/** One annotation record from the stage-1 annotations file, tied to a paragraph by ID. */
interface Ann {
  paragraphId: string;
  label: AnnLabel;
  provenance: AnnProvenance;
}
|
|
|
|
// Record type for a paragraph, read off the `_output` member of the imported
// `Paragraph` schema value (presumably a zod-style schema — confirm against
// @sec-cybert/schemas). Shares its name with the value import on purpose.
type Paragraph = (typeof Paragraph)["_output"];
|
|
|
|
// ── Helpers ────────────────────────────────────────────────────────────
|
|
/**
 * Find the most frequent value in `arr`, provided it occurs at least twice.
 *
 * Ties are broken in favour of the value that appears first in `arr`.
 * Values are compared with `Map` key semantics, so `null` and `undefined`
 * are legitimate values and can win.
 *
 * @param arr Values to tally.
 * @returns The winning value and its count, or `null` when no value occurs
 *          two or more times (including when `arr` is empty).
 */
function majority<T>(arr: T[]): { value: T; count: number } | null {
  const counts = new Map<T, number>();
  for (const v of arr) counts.set(v, (counts.get(v) ?? 0) + 1);

  // Track the leader as a whole record so that "no leader yet" (null) cannot
  // be confused with a leader whose value happens to be null.
  let best: { value: T; count: number } | null = null;
  for (const [value, count] of counts) {
    if (best === null || count > best.count) best = { value, count };
  }

  return best !== null && best.count >= 2 ? best : null;
}
|
|
|
|
/**
 * Pick `count` candidates reproducibly, preferring one per company.
 *
 * When there are no more than `count` candidates the input array itself is
 * returned unchanged. Otherwise candidates are ordered by ID and visited at a
 * fixed stride: the first pass accepts only companies not yet picked, and a
 * second pass over the same positions tops the selection up if needed.
 */
function deterministicSample(
  candidates: { id: string; companyName: string }[],
  count: number,
): typeof candidates {
  if (candidates.length <= count) return candidates;

  // ID order makes the result independent of input order.
  const ordered = candidates.slice().sort((x, y) => x.id.localeCompare(y.id));

  // Stride is sized for roughly 3x oversampling so duplicates can be skipped;
  // it never drops below 1.
  const stride = Math.max(1, Math.floor(ordered.length / (count * 3)));

  const chosen: typeof candidates = [];
  const companies = new Set<string>();

  // Pass 1: at most one paragraph per company.
  let pos = 0;
  while (pos < ordered.length && chosen.length < count) {
    const candidate = ordered[pos];
    if (!companies.has(candidate.companyName)) {
      companies.add(candidate.companyName);
      chosen.push(candidate);
    }
    pos += stride;
  }

  // Pass 2: still short — accept repeated companies from the same positions.
  pos = 0;
  while (pos < ordered.length && chosen.length < count) {
    const candidate = ordered[pos];
    if (chosen.indexOf(candidate) === -1) {
      chosen.push(candidate);
    }
    pos += stride;
  }

  return chosen.slice(0, count);
}
|
|
|
|
/**
 * Shorten a model identifier for display.
 *
 * Keeps only the text after the last "/", drops a trailing "-preview", and
 * caps the result at 30 characters, e.g.
 * "google/gemini-3.1-flash-lite-preview" → "gemini-3.1-flash-lite".
 */
function shortModel(modelId: string): string {
  const previewSuffix = "-preview";

  // lastIndexOf returns -1 when there is no "/", so this keeps the whole string.
  const bare = modelId.slice(modelId.lastIndexOf("/") + 1);

  const trimmed = bare.endsWith(previewSuffix)
    ? bare.slice(0, bare.length - previewSuffix.length)
    : bare;

  return trimmed.slice(0, 30);
}
|
|
|
|
/**
 * Write one sampled paragraph to stdout: a ruled header with filing metadata,
 * the paragraph text, then one boxed entry per annotation showing the model,
 * category, specificity, and reasoning.
 *
 * @param para  Paragraph to print.
 * @param anns  Annotations for that paragraph, printed in array order.
 * @param index Running number shown in the header.
 */
function printSample(
  para: Paragraph,
  anns: Ann[],
  index: number,
) {
  const rule = "─".repeat(80);

  console.log(`\n${rule}`);
  const { filing } = para;
  console.log(` [${index}] ${filing.companyName} (${filing.ticker}) — ${filing.filingType} ${filing.filingDate}`);
  // paragraphIndex is shown 1-based.
  console.log(` Paragraph ${para.paragraphIndex + 1}, ${para.wordCount} words, ID: ${para.id}`);
  console.log(rule);
  console.log();
  console.log(para.text);
  console.log();

  anns.forEach((ann) => {
    console.log(` ┌─ ${shortModel(ann.provenance.modelId)}`);
    const verdict = ann.label;
    console.log(` │ Category: ${verdict.content_category} (${verdict.category_confidence})`);
    console.log(` │ Specificity: ${verdict.specificity_level} (${verdict.specificity_confidence})`);
    console.log(` │ Reasoning: ${verdict.reasoning}`);
    console.log(` └─`);
  });
}
|
|
|
|
// ── Main ───────────────────────────────────────────────────────────────
|
|
/** A sampling candidate: a paragraph ID plus its filing's company name. */
type DisputeCandidate = { id: string; companyName: string };

/** True when exactly two entries of `cats` equal one label and exactly one equals the other. */
function isTwoOneSplit(cats: readonly string[], first: string, second: string): boolean {
  const firstCount = cats.filter(c => c === first).length;
  const secondCount = cats.filter(c => c === second).length;
  return (firstCount === 2 && secondCount === 1) || (firstCount === 1 && secondCount === 2);
}

/**
 * Load paragraphs and annotations, sort disputed paragraphs into four pools,
 * and print a deterministic sample from each pool.
 *
 * Only paragraphs with at least 3 annotations and a matching paragraph record
 * are considered. A paragraph may land in more than one pool.
 */
async function main(): Promise<void> {
  // How many paragraphs to print per pool.
  const samplesPerPool = 5;

  console.log("Loading paragraphs...");
  const { records: paragraphs } = await readJsonl(PARAGRAPHS, Paragraph);
  console.log(` ${paragraphs.length.toLocaleString()} paragraphs`);

  const paraById = new Map(paragraphs.map(p => [p.id, p]));

  console.log("Loading annotations...");
  const { records: rawAnns, skipped } = await readJsonlRaw(ANNOTATIONS);
  // NOTE(review): raw records are trusted to match `Ann`; nothing validates them here.
  const anns = rawAnns as Ann[];
  console.log(` ${anns.length.toLocaleString()} annotations (${skipped} skipped)`);

  // Group annotations by paragraph
  const byParagraph = new Map<string, Ann[]>();
  for (const a of anns) {
    let arr = byParagraph.get(a.paragraphId);
    if (!arr) { arr = []; byParagraph.set(a.paragraphId, arr); }
    arr.push(a);
  }

  // Count paragraphs per company
  const companyParaCount = new Map<string, number>();
  for (const p of paragraphs) {
    const name = p.filing.companyName;
    companyParaCount.set(name, (companyParaCount.get(name) ?? 0) + 1);
  }

  // ── Build candidate pools ────────────────────────────────────────────

  // Pool 1: Spec [3,4] disputes — specs are [3,3,4] or [3,4,4]
  const specDisputes: DisputeCandidate[] = [];
  // Pool 2: Management ↔ RMP disputes — 2 say one, 1 says the other
  const mgmtRmpDisputes: DisputeCandidate[] = [];
  // Pool 3: None/Other ↔ Strategy disputes
  const noneStrategyDisputes: DisputeCandidate[] = [];
  // Pool 4: Small company (1-3 paras) + unresolved (3-way cat split)
  const smallUnresolved: DisputeCandidate[] = [];

  for (const [pid, pannAnns] of byParagraph) {
    if (pannAnns.length < 3) continue;
    const para = paraById.get(pid);
    if (!para) continue;

    const cats = pannAnns.map(a => a.label.content_category);
    const specs = pannAnns.map(a => a.label.specificity_level);
    const companyName = para.filing.companyName;

    // Pool 1: Spec [3,3,4] or [3,4,4]
    const specKey = [...specs].sort((a, b) => a - b).join(",");
    if (specKey === "3,3,4" || specKey === "3,4,4") {
      specDisputes.push({ id: pid, companyName });
    }

    // Pool 2: Management ↔ RMP (2-1 split in either direction)
    if (isTwoOneSplit(cats, "Management Role", "Risk Management Process")) {
      mgmtRmpDisputes.push({ id: pid, companyName });
    }

    // Pool 3: None/Other ↔ Strategy Integration (2-1 split in either direction)
    if (isTwoOneSplit(cats, "None/Other", "Strategy Integration")) {
      noneStrategyDisputes.push({ id: pid, companyName });
    }

    // Pool 4: Small company (1-3 paras) with 3-way cat split (unresolved)
    const paraCount = companyParaCount.get(companyName) ?? 0;
    if (paraCount <= 3 && new Set(cats).size === 3) {
      smallUnresolved.push({ id: pid, companyName });
    }
  }

  console.log(`\nCandidate pools:`);
  console.log(` Spec [3,4] disputes: ${specDisputes.length}`);
  console.log(` Management ↔ RMP disputes: ${mgmtRmpDisputes.length}`);
  console.log(` None/Other ↔ Strategy disputes: ${noneStrategyDisputes.length}`);
  console.log(` Small co. unresolved: ${smallUnresolved.length}`);

  // ── Sample and print ────────────────────────────────────────────────

  const sections: [string, DisputeCandidate[]][] = [
    ["SPEC [3,4] DISPUTES — Models can't decide firm-specific vs quantified-verifiable", specDisputes],
    ["MANAGEMENT ↔ RMP DISPUTES — 2-vs-1 split between Management Role and Risk Management Process", mgmtRmpDisputes],
    ["NONE/OTHER ↔ STRATEGY INTEGRATION DISPUTES — 2-vs-1 split between None/Other and Strategy Integration", noneStrategyDisputes],
    ["SMALL COMPANY (1-3 PARAS) UNRESOLVED — 3-way category split, tiny filings", smallUnresolved],
  ];

  const rule = "═".repeat(80);
  let globalIdx = 1;
  for (const [title, pool] of sections) {
    console.log(`\n${rule}`);
    console.log(` ${title}`);
    console.log(rule);

    const sampled = deterministicSample(pool, samplesPerPool);
    if (sampled.length === 0) {
      console.log("\n (no candidates found)");
      continue;
    }

    for (const item of sampled) {
      const para = paraById.get(item.id);
      const pannAnns = byParagraph.get(item.id);
      // Every candidate was built from these two maps, so both lookups hit;
      // the guard only replaces non-null assertions.
      if (!para || !pannAnns) continue;
      printSample(para, pannAnns, globalIdx++);
    }
  }

  console.log(`\n${rule}`);
  console.log(` Done — ${globalIdx - 1} paragraphs printed.`);
  console.log(rule);
}
|
|
|
|
// Run the script. On failure, log the error and mark the process as failed so
// shells and CI see a non-zero exit status instead of a silent success.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|