SEC-cyBERT/ts/scripts/dispute-crosstab.ts
2026-03-28 20:39:36 -04:00

502 lines
24 KiB
TypeScript

/**
* Detailed cross-tabulations for disputed (non-unanimous) paragraphs.
*
* Usage: bun ts/scripts/dispute-crosstab.ts
*/
import { fileURLToPath } from "node:url";

import { readJsonlRaw, readJsonl } from "../src/lib/jsonl.ts";
import { Paragraph } from "../src/schemas/paragraph.ts";
// Input files, resolved relative to this script rather than the working directory.
// fileURLToPath is used instead of URL.pathname because pathname stays
// percent-encoded (a space in a parent directory becomes "%20") and is not a
// valid filesystem path for Windows drive letters.
const ANN_PATH = fileURLToPath(new URL("../../data/annotations/stage1.jsonl", import.meta.url));
const PARA_PATH = fileURLToPath(new URL("../../data/paragraphs/paragraphs-clean.jsonl", import.meta.url));
/**
 * Shape assumed for one record of the stage-1 annotations file.
 *
 * Records are cast to this type in main() without runtime validation, so the
 * fields below describe an expectation, not a guarantee.
 */
interface Ann {
// Id of the annotated paragraph; used to group annotations and to look up the paragraph.
paragraphId: string;
label: {
// Category vote; compared across a paragraph's annotations.
content_category: string;
// Specificity vote; the reports in this script tabulate levels 1 through 4.
specificity_level: number;
// The three fields below are not read anywhere in the visible script.
category_confidence: string;
specificity_confidence: string;
reasoning: string;
};
// Per-annotation request metadata; not read anywhere in the visible script.
provenance: {
modelId: string;
costUsd: number;
inputTokens: number;
outputTokens: number;
reasoningTokens: number;
latencyMs: number;
requestedAt: string;
};
}
// ── Helpers ────────────────────────────────────────────────────────────
/**
 * Renders `n / total` as a percentage string with one decimal place.
 *
 * @param n - Numerator.
 * @param total - Denominator; a zero total yields "0.0%" instead of dividing.
 * @returns The percentage followed by a "%" sign, e.g. "33.3%".
 */
function pct(n: number, total: number): string {
  const ratio = total === 0 ? 0 : n / total;
  const scaled = ratio * 100;
  return scaled.toFixed(1) + "%";
}
/**
 * Median of a list of numbers; the input array is not modified.
 *
 * @param arr - Values in any order.
 * @returns The middle value, the mean of the two middle values for an
 *   even-length list, or 0 for an empty list.
 */
function median(arr: number[]): number {
  const n = arr.length;
  if (n === 0) {
    return 0;
  }
  const ordered = arr.slice().sort((x, y) => x - y);
  const upper = Math.floor(n / 2);
  if (n % 2 !== 0) {
    return ordered[upper];
  }
  return (ordered[upper - 1] + ordered[upper]) / 2;
}
/**
 * Percentile of a list of numbers using linear interpolation between the two
 * nearest ranks; the input array is not modified.
 *
 * @param arr - Values in any order.
 * @param p - Percentile on a 0-100 scale.
 * @returns The interpolated value, or 0 for an empty list.
 */
function percentile(arr: number[], p: number): number {
  if (arr.length === 0) {
    return 0;
  }
  const ordered = arr.slice().sort((x, y) => x - y);
  // Fractional position of the requested percentile within the sorted list.
  const rank = (p / 100) * (ordered.length - 1);
  const below = Math.floor(rank);
  const above = Math.ceil(rank);
  if (below === above) {
    return ordered[below];
  }
  const gap = ordered[above] - ordered[below];
  return ordered[below] + gap * (rank - below);
}
/**
 * Returns the value that holds a strict majority of the votes, if any.
 *
 * The threshold is derived from the number of votes (more than half) rather
 * than fixed at two, so the result is also correct when the list does not
 * have exactly three entries. For three votes this is the same as "at least
 * two agree".
 *
 * @param arr - The votes to tally.
 * @returns The value occurring in more than half of `arr`, or null when no
 *   value does (including for an empty list).
 */
function majority<T>(arr: T[]): T | null {
  const freq = new Map<T, number>();
  for (const v of arr) freq.set(v, (freq.get(v) ?? 0) + 1);
  const half = arr.length / 2;
  for (const [val, count] of freq) {
    // At most one value can exceed half of the votes, so the first hit is the answer.
    if (count > half) return val;
  }
  return null;
}
/**
 * Formats numbers in ascending order as a bracketed, comma-separated list,
 * e.g. [3, 1, 2] becomes "[1,2,3]". Duplicates are kept and the input array
 * is not modified.
 */
function sortedVals(arr: number[]): string {
  const ascending = arr.slice().sort((x, y) => x - y);
  return "[" + ascending.join(",") + "]";
}
/**
 * Returns the distinct strings of `arr` in the default (code-unit) sort
 * order, as a new array.
 */
function uniqueSorted(arr: string[]): string[] {
  const distinct = new Set<string>(arr);
  return Array.from(distinct).sort();
}
// ── Main ──────────────────────────────────────────────────────────────
/**
 * Loads the stage-1 annotations and the cleaned paragraphs, classifies every
 * paragraph that has exactly three annotations by what its annotators
 * disagreed on, and prints six console reports:
 *
 *   1. majority category x majority specificity for disputed paragraphs
 *   2. dispute type (cat-only / spec-only / both) per majority category
 *   3. specificity vote patterns per majority category
 *   4. word-count statistics per dispute type
 *   5. paragraphs with no majority category (three-way splits)
 *   6. combined category + specificity patterns where both axes are disputed
 *
 * Paragraphs with any other number of annotations are skipped silently.
 *
 * @returns A promise that resolves once every report has been printed.
 *   Rejections from the two loaders propagate to the caller.
 */
async function main(): Promise<void> {
  /** Adds one to the tally stored under `key`; a missing key counts as 0. */
  function bump<K>(tally: Map<K, number>, key: K): void {
    tally.set(key, (tally.get(key) ?? 0) + 1);
  }

  /** Distinct specificity votes in ascending order, e.g. [2,1,2] -> "[1,2]". */
  function specBoundary(specs: number[]): string {
    return `[${[...new Set(specs)].sort((a, b) => a - b).join(",")}]`;
  }

  /**
   * Label for a category disagreement: "A<->B" when exactly two distinct
   * categories were voted (a 2-1 split), otherwise the distinct categories
   * joined by "/". The two names were previously concatenated with no
   * separator, which made the pair unreadable.
   */
  function categorySplit(cats: string[]): string {
    const distinct = uniqueSorted(cats);
    if (distinct.length === 2) return `${distinct[0]}<->${distinct[1]}`;
    return distinct.join("/");
  }

  console.log("Loading data...");
  const [{ records: rawAnns, skipped: annSkipped }, { records: paragraphs, skipped: paraSkipped }] =
    await Promise.all([
      readJsonlRaw(ANN_PATH),
      readJsonl(PARA_PATH, Paragraph),
    ]);
  // Unchecked cast: nothing here verifies that the records actually match Ann.
  const anns = rawAnns as Ann[];
  console.log(` ${anns.length.toLocaleString()} annotations (${annSkipped} skipped)`);
  console.log(` ${paragraphs.length.toLocaleString()} paragraphs (${paraSkipped} skipped)\n`);

  // Index paragraphs by id
  const paraById = new Map(paragraphs.map(p => [p.id, p]));

  // Group annotations by paragraph
  const byParagraph = new Map<string, Ann[]>();
  for (const a of anns) {
    let arr = byParagraph.get(a.paragraphId);
    if (!arr) { arr = []; byParagraph.set(a.paragraphId, arr); }
    arr.push(a);
  }

  // Classify each paragraph
  interface ParaInfo {
    pid: string;
    cats: string[];
    specs: number[];
    catUnanimous: boolean;
    specUnanimous: boolean;
    majCat: string | null;
    majSpec: number | null;
    catDisputed: boolean;
    specDisputed: boolean;
    disputeType: "none" | "cat-only" | "spec-only" | "both";
    wordCount: number;
  }
  const allParas: ParaInfo[] = [];
  for (const [pid, panns] of byParagraph) {
    // Only paragraphs with exactly three annotations are analysed.
    if (panns.length !== 3) continue;
    const cats = panns.map(a => a.label.content_category);
    const specs = panns.map(a => a.label.specificity_level);
    const catU = new Set(cats).size === 1;
    const specU = new Set(specs).size === 1;
    const catDisputed = !catU;
    const specDisputed = !specU;
    let disputeType: ParaInfo["disputeType"] = "none";
    if (catDisputed && specDisputed) disputeType = "both";
    else if (catDisputed) disputeType = "cat-only";
    else if (specDisputed) disputeType = "spec-only";
    allParas.push({
      pid,
      cats,
      specs,
      catUnanimous: catU,
      specUnanimous: specU,
      majCat: majority(cats),
      majSpec: majority(specs),
      catDisputed,
      specDisputed,
      disputeType,
      // Annotations whose paragraph is missing get 0, which section 4 filters out.
      wordCount: paraById.get(pid)?.wordCount ?? 0,
    });
  }

  const disputed = allParas.filter(p => p.disputeType !== "none");
  const catOnly = allParas.filter(p => p.disputeType === "cat-only");
  const specOnly = allParas.filter(p => p.disputeType === "spec-only");
  const bothDisputed = allParas.filter(p => p.disputeType === "both");

  console.log("═══════════════════════════════════════════════════════════════════");
  console.log(" DISPUTE CROSS-TABULATION ANALYSIS");
  console.log("═══════════════════════════════════════════════════════════════════");
  console.log(` Total paragraphs (3-annotator): ${allParas.length.toLocaleString()}`);
  console.log(` Disputed (not both-unanimous): ${disputed.length.toLocaleString()} (${pct(disputed.length, allParas.length)})`);
  console.log(` Cat-only: ${catOnly.length.toLocaleString()}`);
  console.log(` Spec-only: ${specOnly.length.toLocaleString()}`);
  console.log(` Both: ${bothDisputed.length.toLocaleString()}`);

  // ════════════════════════════════════════════════════════════════════════
  // 1. CATEGORY x SPECIFICITY CROSS-TAB FOR DISPUTED PARAGRAPHS
  // ════════════════════════════════════════════════════════════════════════
  console.log("\n\n══════════════════════════════════════════════════════════════");
  console.log(" 1. CATEGORY x SPECIFICITY CROSS-TAB (disputed paragraphs)");
  console.log("══════════════════════════════════════════════════════════════");
  console.log(" Uses majority-vote labels for both axes.\n");

  // Collect all categories from majority votes, most frequent first
  const catCounts = new Map<string, number>();
  for (const p of disputed) {
    if (p.majCat) bump(catCounts, p.majCat);
  }
  const categories = [...catCounts.entries()].sort((a, b) => b[1] - a[1]).map(([c]) => c);
  const specLevels = [1, 2, 3, 4];
  const specLabels = ["GenBoiler", "SectorAdpt", "FirmSpec", "QuantVerif"];

  // Build the cross-tab, keyed by "category|specificity"
  const crossTab = new Map<string, number>();
  let noMajCat = 0, noMajSpec = 0;
  for (const p of disputed) {
    if (!p.majCat || p.majSpec === null) {
      if (!p.majCat) noMajCat++;
      if (p.majSpec === null) noMajSpec++;
      continue;
    }
    bump(crossTab, `${p.majCat}|${p.majSpec}`);
  }

  // Print matrix
  const colW = 12;
  const catW = 28;
  let header = "Category".padEnd(catW);
  for (let i = 0; i < specLevels.length; i++) {
    header += `${specLevels[i]}:${specLabels[i]}`.padStart(colW);
  }
  header += "Total".padStart(colW);
  console.log(` ${header}`);
  console.log(` ${"─".repeat(header.length)}`);
  for (const cat of categories) {
    // Counts stay numeric; no string round-trip is needed to format them.
    const counts = specLevels.map(s => crossTab.get(`${cat}|${s}`) ?? 0);
    const rowTotal = counts.reduce((a, b) => a + b, 0);
    let row = cat.padEnd(catW);
    for (const v of counts) {
      const rowPct = rowTotal > 0 ? ((v / rowTotal) * 100).toFixed(0) : "0";
      row += `${v} (${rowPct}%)`.padStart(colW);
    }
    row += `${rowTotal}`.padStart(colW);
    console.log(` ${row}`);
  }
  console.log(`\n (${noMajCat} paragraphs had no majority category, ${noMajSpec} had no majority specificity)`);

  // ════════════════════════════════════════════════════════════════════════
  // 2. DISPUTE TYPE BY CATEGORY
  // ════════════════════════════════════════════════════════════════════════
  console.log("\n\n══════════════════════════════════════════════════════════════");
  console.log(" 2. DISPUTE TYPE BY MAJORITY CATEGORY");
  console.log("══════════════════════════════════════════════════════════════");
  console.log(" For each majority category, % of disputes that are cat-only, spec-only, or both.\n");

  // Group disputed paragraphs by majority category
  const disputeByCat = new Map<string, { catOnly: number; specOnly: number; both: number }>();
  for (const p of disputed) {
    const cat = p.majCat ?? "[no majority]";
    let entry = disputeByCat.get(cat);
    if (!entry) {
      entry = { catOnly: 0, specOnly: 0, both: 0 };
      disputeByCat.set(cat, entry);
    }
    if (p.disputeType === "cat-only") entry.catOnly++;
    else if (p.disputeType === "spec-only") entry.specOnly++;
    else if (p.disputeType === "both") entry.both++;
  }
  const dHeader = "Category".padEnd(catW) + "n".padStart(8) + "Cat-only".padStart(12) + "Spec-only".padStart(12) + "Both".padStart(12);
  console.log(` ${dHeader}`);
  console.log(` ${"─".repeat(dHeader.length)}`);
  const sortedDispCats = [...disputeByCat.entries()].sort((a, b) => {
    const totalA = a[1].catOnly + a[1].specOnly + a[1].both;
    const totalB = b[1].catOnly + b[1].specOnly + b[1].both;
    return totalB - totalA;
  });
  for (const [cat, d] of sortedDispCats) {
    const total = d.catOnly + d.specOnly + d.both;
    const row = cat.padEnd(catW) +
      total.toString().padStart(8) +
      `${d.catOnly} (${pct(d.catOnly, total)})`.padStart(12) +
      `${d.specOnly} (${pct(d.specOnly, total)})`.padStart(12) +
      `${d.both} (${pct(d.both, total)})`.padStart(12);
    console.log(` ${row}`);
  }

  // ════════════════════════════════════════════════════════════════════════
  // 3. SPEC BOUNDARY DISPUTES BY CATEGORY
  // ════════════════════════════════════════════════════════════════════════
  console.log("\n\n══════════════════════════════════════════════════════════════");
  console.log(" 3. SPECIFICITY BOUNDARY DISPUTES BY CATEGORY");
  console.log("══════════════════════════════════════════════════════════════");
  console.log(" For spec-disputed paragraphs, the spec vote pattern by majority category.\n");

  // Group by majority category, then count spec patterns
  const specPatternByCat = new Map<string, Map<string, number>>();
  for (const p of allParas) {
    if (!p.specDisputed) continue;
    const cat = p.majCat ?? "[no majority]";
    let patternMap = specPatternByCat.get(cat);
    if (!patternMap) {
      patternMap = new Map();
      specPatternByCat.set(cat, patternMap);
    }
    // The unique values, sorted, form the boundary pattern
    bump(patternMap, specBoundary(p.specs));
  }

  // Collect all patterns; they become the table columns
  const allPatterns = new Set<string>();
  for (const pm of specPatternByCat.values()) {
    for (const pat of pm.keys()) allPatterns.add(pat);
  }
  const sortedPatterns = [...allPatterns].sort();

  const patW = 10;
  let pHeader = "Category".padEnd(catW) + "n".padStart(6);
  for (const pat of sortedPatterns) {
    pHeader += pat.padStart(patW);
  }

  // Rows ordered by number of spec-disputed paragraphs, largest first
  const specPatRows = [...specPatternByCat.entries()]
    .map(([cat, pm]) => {
      let total = 0;
      for (const v of pm.values()) total += v;
      return { cat, pm, total };
    })
    .sort((a, b) => b.total - a.total);

  /** Prints one row per category; empty cells show "-", others use `cell`. */
  function printPatternRows(cell: (v: number, total: number) => string): void {
    for (const { cat, pm, total } of specPatRows) {
      let row = cat.padEnd(catW) + total.toString().padStart(6);
      for (const pat of sortedPatterns) {
        const v = pm.get(pat) ?? 0;
        row += (v === 0 ? "-" : cell(v, total)).padStart(patW);
      }
      console.log(` ${row}`);
    }
  }

  // Raw counts
  console.log(` ${pHeader}`);
  console.log(` ${"─".repeat(pHeader.length)}`);
  printPatternRows(v => `${v}`);

  // Same table as percentages within each category
  console.log("\n (Row percentages:)");
  console.log(` ${pHeader}`);
  console.log(` ${"─".repeat(pHeader.length)}`);
  printPatternRows((v, total) => `${((v / total) * 100).toFixed(0)}%`);

  // ════════════════════════════════════════════════════════════════════════
  // 4. WORD COUNT DISTRIBUTION BY DISPUTE TYPE
  // ════════════════════════════════════════════════════════════════════════
  console.log("\n\n══════════════════════════════════════════════════════════════");
  console.log(" 4. WORD COUNT DISTRIBUTION BY DISPUTE TYPE");
  console.log("══════════════════════════════════════════════════════════════\n");
  const groups: { label: string; paras: ParaInfo[] }[] = [
    { label: "Unanimous (no dispute)", paras: allParas.filter(p => p.disputeType === "none") },
    { label: "Cat-only dispute", paras: catOnly },
    { label: "Spec-only dispute", paras: specOnly },
    { label: "Both disputed", paras: bothDisputed },
  ];
  const wcHeader = "Dispute Type".padEnd(28) + "n".padStart(8) + "Median".padStart(10) + "P90".padStart(10) + "P10".padStart(10) + "Mean".padStart(10);
  console.log(` ${wcHeader}`);
  console.log(` ${"─".repeat(wcHeader.length)}`);
  for (const g of groups) {
    // Zero word counts (including paragraphs that were not found) are excluded.
    const wcs = g.paras.map(p => p.wordCount).filter(w => w > 0);
    if (wcs.length === 0) continue;
    const row = g.label.padEnd(28) +
      wcs.length.toString().padStart(8) +
      median(wcs).toFixed(0).padStart(10) +
      percentile(wcs, 90).toFixed(0).padStart(10) +
      percentile(wcs, 10).toFixed(0).padStart(10) +
      (wcs.reduce((a, b) => a + b, 0) / wcs.length).toFixed(0).padStart(10);
    console.log(` ${row}`);
  }

  // ════════════════════════════════════════════════════════════════════════
  // 5. UNRESOLVED PARAGRAPH ANALYSIS (3-WAY SPLITS)
  // ════════════════════════════════════════════════════════════════════════
  console.log("\n\n══════════════════════════════════════════════════════════════");
  console.log(" 5. UNRESOLVED PARAGRAPH ANALYSIS (3-way category splits)");
  console.log("══════════════════════════════════════════════════════════════\n");
  const unresolved = allParas.filter(p => p.majCat === null);
  console.log(` Total unresolved paragraphs: ${unresolved.length.toLocaleString()}`);

  // Per-annotation tallies of categories, specificity levels and full vote triples
  const unresolvedCatFreq = new Map<string, number>();
  const unresolvedSpecFreq = new Map<number, number>();
  const threewayPatterns = new Map<string, number>();
  for (const p of unresolved) {
    for (const c of p.cats) bump(unresolvedCatFreq, c);
    for (const s of p.specs) bump(unresolvedSpecFreq, s);
    bump(threewayPatterns, [...p.cats].sort().join(" / "));
  }

  console.log("\n Categories appearing in unresolved paragraphs (annotation count):");
  const sortedUnresCats = [...unresolvedCatFreq.entries()].sort((a, b) => b[1] - a[1]);
  for (const [cat, count] of sortedUnresCats) {
    console.log(` ${count.toString().padStart(6)} ${cat}`);
  }

  console.log("\n Specificity levels in unresolved paragraphs (annotation count):");
  for (const s of specLevels) {
    const count = unresolvedSpecFreq.get(s) ?? 0;
    console.log(` ${count.toString().padStart(6)} ${s}`);
  }

  console.log("\n Most common 3-way category splits:");
  const sortedThreeWay = [...threewayPatterns.entries()].sort((a, b) => b[1] - a[1]);
  for (const [pattern, count] of sortedThreeWay.slice(0, 20)) {
    console.log(` ${count.toString().padStart(6)} ${pattern}`);
  }
  if (sortedThreeWay.length > 20) {
    console.log(` ... and ${sortedThreeWay.length - 20} more patterns`);
  }

  // Specificity agreement among unresolved
  const unresolvedSpecUnanimous = unresolved.filter(p => p.specUnanimous).length;
  const unresolvedSpecMaj = unresolved.filter(p => p.majSpec !== null).length;
  console.log(`\n Specificity agreement among unresolved:`);
  console.log(` Spec unanimous: ${unresolvedSpecUnanimous} (${pct(unresolvedSpecUnanimous, unresolved.length)})`);
  console.log(` Spec majority: ${unresolvedSpecMaj} (${pct(unresolvedSpecMaj, unresolved.length)})`);
  console.log(` Spec 3-way: ${unresolved.length - unresolvedSpecMaj} (${pct(unresolved.length - unresolvedSpecMaj, unresolved.length)})`);

  // ════════════════════════════════════════════════════════════════════════
  // 6. "BOTH" DISPUTES — COMBINED PATTERNS
  // ════════════════════════════════════════════════════════════════════════
  console.log("\n\n══════════════════════════════════════════════════════════════");
  console.log(" 6. 'BOTH' DISPUTES — COMBINED CATEGORY + SPECIFICITY PATTERNS");
  console.log("══════════════════════════════════════════════════════════════\n");
  console.log(` Total paragraphs with both cat AND spec disputed: ${bothDisputed.length.toLocaleString()}\n`);

  // One pass builds the combined tally plus the per-axis aggregates
  const combinedPatterns = new Map<string, number>();
  const catPairAgg = new Map<string, number>();
  const specPatAgg = new Map<string, number>();
  for (const p of bothDisputed) {
    const catPart = categorySplit(p.cats);
    const specPart = specBoundary(p.specs);
    bump(combinedPatterns, `${catPart} + ${specPart}`);
    bump(catPairAgg, catPart);
    bump(specPatAgg, specPart);
  }

  const sortedCombined = [...combinedPatterns.entries()].sort((a, b) => b[1] - a[1]);
  console.log(" Top 30 combined dispute patterns:");
  for (const [pattern, count] of sortedCombined.slice(0, 30)) {
    console.log(` ${count.toString().padStart(6)} ${pattern}`);
  }
  if (sortedCombined.length > 30) {
    console.log(`\n ... and ${sortedCombined.length - 30} more patterns (${sortedCombined.slice(30).reduce((a, b) => a + b[1], 0)} paragraphs)`);
  }

  // Also summarize by just the category pair (aggregating across spec patterns)
  console.log("\n Category dispute pairs (aggregated across spec patterns):");
  const sortedCatPairs = [...catPairAgg.entries()].sort((a, b) => b[1] - a[1]);
  for (const [pair, count] of sortedCatPairs.slice(0, 20)) {
    console.log(` ${count.toString().padStart(6)} ${pair}`);
  }

  console.log("\n Spec boundary patterns within 'both' disputes:");
  const sortedSpecPats = [...specPatAgg.entries()].sort((a, b) => b[1] - a[1]);
  for (const [pat, count] of sortedSpecPats) {
    console.log(` ${count.toString().padStart(6)} ${pat} (${pct(count, bothDisputed.length)})`);
  }

  console.log("\n═══════════════════════════════════════════════════════════════════");
  console.log(" ANALYSIS COMPLETE");
  console.log("═══════════════════════════════════════════════════════════════════");
}
// Run the script; on any failure, print the error and exit with status 1.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});