// SEC-cyBERT/ts/scripts/segment-analysis.ts
// 2026-03-28 23:44:37 -04:00
//
// 433 lines
// 20 KiB
// TypeScript
// Raw Blame History
//
// This file contains ambiguous Unicode characters
//
// This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Cross-tabulate agreement status against all paragraph metadata dimensions.
*
* Segments every paragraph into: unanimous | majority | unresolved
* Then breaks down by: fiscal year, filing type, sec item, category,
* specificity, confidence, company size (paragraph count proxy),
* word count quintile, and cross-dimensions.
*
* Usage: bun ts/scripts/segment-analysis.ts
*/
import { fileURLToPath } from "node:url";

import { Paragraph } from "@sec-cybert/schemas/paragraph.ts";

import { readJsonl, readJsonlRaw } from "../src/lib/jsonl.ts";
// Input files, resolved relative to this script rather than the working directory.
// fileURLToPath is used instead of URL#pathname because pathname stays
// percent-encoded (a space in a parent directory becomes "%20") and is not a
// native filesystem path on every platform.
const PARAGRAPHS = fileURLToPath(new URL("../../data/paragraphs/paragraphs-clean.jsonl", import.meta.url));
const ANNOTATIONS = fileURLToPath(new URL("../../data/annotations/stage1.jsonl", import.meta.url));
// ── Types ──────────────────────────────────────────────────────────────
/** The labels carried by one annotation. */
interface AnnLabel {
  content_category: string;
  // Numeric level; the report maps 1–4 to display names and prints other values as-is.
  specificity_level: number;
  // Confidence strings; the report scores "high" as 3, "medium" as 2 and anything else as 1.
  category_confidence: string;
  specificity_confidence: string;
  reasoning: string;
}

/** Bookkeeping attached to an annotation. None of these fields are read by the report code in this file. */
interface AnnProvenance {
  modelId: string;
  costUsd: number;
  inputTokens: number;
  outputTokens: number;
  reasoningTokens: number;
  latencyMs: number;
  requestedAt: string;
}

/**
 * Shape this script assumes for each record of the annotations file.
 * Records are cast to this type after loading; they are not validated against it.
 */
interface Ann {
  // Joins the annotation to a paragraph record by id.
  paragraphId: string;
  label: AnnLabel;
  provenance: AnnProvenance;
}
/** Overall agreement bucket for a paragraph, combining category and specificity. */
type Segment = "unanimous" | "majority" | "unresolved";

/** Agreement level on a single dimension; prefixed with "cat-" or "spec-" below. */
type AgreementLevel = "unanimous" | "majority" | "split";

/** One analysed paragraph: its agreement classification, the raw votes, and filing metadata. */
interface ParagraphAnalysis {
  id: string;

  // Agreement classification
  segment: Segment;
  catSegment: `cat-${AgreementLevel}`;
  specSegment: `spec-${AgreementLevel}`;

  // Winning labels (the first annotation's label when no value wins)
  majorityCat: string;
  majoritySpec: number;

  // Raw per-annotation votes, index-aligned with each other
  cats: string[];
  specs: number[];
  catConfidences: string[];
  specConfidences: string[];

  // Filing metadata
  companyName: string;
  ticker: string;
  filingType: string;
  filingDate: string;
  fiscalYear: number;
  secItem: string;
  wordCount: number;
}
// ── Helpers ────────────────────────────────────────────────────────────
/**
 * Formats `n / total` as a percentage with one decimal place.
 *
 * @param n - Numerator.
 * @param total - Denominator; a zero denominator yields "0.0%" rather than NaN.
 * @returns The percentage followed by a "%" sign, e.g. "25.0%".
 */
function pct(n: number, total: number): string {
  if (total === 0) {
    return "0.0%";
  }
  const share = (n / total) * 100;
  return `${share.toFixed(1)}%`;
}
/**
 * Finds the value that a strict majority of `arr` agrees on.
 *
 * A value wins only when it occurs at least twice AND in more than half of the
 * entries. For three entries this is the usual "2 of 3" rule; for larger vote
 * sets it prevents ties (2 vs 2) and mere pluralities (2 of 5) from being
 * reported as a majority.
 *
 * @param arr - The votes to tally. Values are compared with Map key equality.
 * @returns The winning value and its vote count, or `null` when no value wins
 *          (including for empty and single-element input).
 */
function majority<T>(arr: T[]): { value: T; count: number } | null {
  const counts = new Map<T, number>();
  for (const v of arr) counts.set(v, (counts.get(v) ?? 0) + 1);

  // Track the leader as an object so that `null` can itself be a voted value
  // without being mistaken for "no leader yet".
  let leader: { value: T; count: number } | null = null;
  for (const [value, count] of counts) {
    // Strict ">" keeps the first-seen value when counts are equal.
    if (leader === null || count > leader.count) leader = { value, count };
  }
  if (leader === null) return null;

  const isStrictMajority = leader.count >= 2 && leader.count * 2 > arr.length;
  return isStrictMajority ? leader : null;
}
/**
 * Prints a banner followed by one table row per key, showing the total and the
 * unanimous / majority / unresolved counts plus the unanimous and unresolved
 * shares of the total.
 *
 * @param label - Title printed inside the banner.
 * @param counts - Per-key tallies; rows are printed in descending order of `total`.
 */
function printDistribution(label: string, counts: Map<string, { total: number; unanimous: number; majority: number; unresolved: number }>) {
  const rule = "═".repeat(70);
  console.log(`\n${rule}`);
  console.log(` ${label}`);
  console.log(rule);

  // Largest groups first.
  const rows = Array.from(counts).sort((x, y) => y[1].total - x[1].total);
  // The key column is 20 wide unless a longer key needs more room.
  const keyWidth = rows.reduce((widest, [name]) => Math.max(widest, name.length), 20);
  const cell = (text: string): string => text.padStart(7);

  const header = ["Total", "Unan", "Maj", "Unres", "Unan%", "Unres%"]
    .map(h => cell(h))
    .join(" ");
  console.log(` ${"".padEnd(keyWidth)} ${header}`);
  console.log(` ${"─".repeat(keyWidth + 50)}`);

  rows.forEach(([name, stats]) => {
    const cells = [
      String(stats.total),
      String(stats.unanimous),
      String(stats.majority),
      String(stats.unresolved),
      pct(stats.unanimous, stats.total),
      pct(stats.unresolved, stats.total),
    ]
      .map(c => cell(c))
      .join(" ");
    console.log(` ${name.padEnd(keyWidth)} ${cells}`);
  });
}
/**
 * Prints a banner followed by a row-percentage cross-tabulation: each cell is
 * that column's share of the row total, and the row total is appended as `(n=…)`.
 *
 * @param label - Title printed inside the banner.
 * @param rows - Row key → (column key → count). Rows are printed sorted by key.
 * @param colOrder - Columns to show, in order. When omitted, every column key
 *                   found in `rows` is shown in sorted order.
 */
function printCrossTab(label: string, rows: Map<string, Map<string, number>>, colOrder?: string[]) {
  const rule = "═".repeat(70);
  console.log(`\n${rule}`);
  console.log(` ${label}`);
  console.log(rule);

  const discoverColumns = (): string[] => {
    const seen = new Set<string>();
    for (const cells of rows.values()) {
      for (const col of cells.keys()) seen.add(col);
    }
    return [...seen].sort();
  };
  const columns = colOrder ?? discoverColumns();

  const cellWidth = 8;
  // The key column is 15 wide unless a longer row key needs more room.
  const keyWidth = [...rows.keys()].reduce((widest, k) => Math.max(widest, k.length), 15);

  // Column titles are truncated to the cell width so the table stays aligned.
  const header = columns.map(col => col.slice(0, cellWidth).padStart(cellWidth)).join(" ");
  console.log(` ${"".padEnd(keyWidth)} ${header}`);
  console.log(` ${"─".repeat(keyWidth + (cellWidth + 2) * columns.length)}`);

  const ordered = Array.from(rows).sort((x, y) => x[0].localeCompare(y[0]));
  for (const [rowKey, cells] of ordered) {
    let rowTotal = 0;
    for (const n of cells.values()) rowTotal += n;

    const rendered = columns
      .map(col => pct(cells.get(col) ?? 0, rowTotal).padStart(cellWidth))
      .join(" ");
    console.log(` ${rowKey.padEnd(keyWidth)} ${rendered} (n=${rowTotal})`);
  }
}
// ── Main ───────────────────────────────────────────────────────────────
/**
 * Loads the paragraphs and annotations files, classifies every paragraph that
 * has at least three annotations and a matching paragraph record by annotator
 * agreement, and prints a series of breakdown tables to stdout.
 *
 * Nothing is returned or written to disk; all output goes through console.log.
 */
async function main(): Promise<void> {
  // ── Local helpers ───────────────────────────────────────────────────
  const rule = "═".repeat(70);
  /** Prints the three-line section banner used by the free-form sections. */
  const banner = (title: string): void => {
    console.log(`\n${rule}`);
    console.log(` ${title}`);
    console.log(rule);
  };
  /** Increments the counter stored under `key`. */
  const bump = <K>(tally: Map<K, number>, key: K): void => {
    tally.set(key, (tally.get(key) ?? 0) + 1);
  };
  /** Prints the 15 most frequent keys of a tally, most frequent first. */
  const printTop = (tally: Map<string, number>, keyWidth: number): void => {
    const top = [...tally.entries()].sort(([, a], [, b]) => b - a).slice(0, 15);
    for (const [key, count] of top) {
      console.log(` ${key.padEnd(keyWidth)} ${count.toLocaleString()}`);
    }
  };
  /** Builds a row → column → count table over the given items. */
  const crossCount = (
    items: ParagraphAnalysis[],
    rowKey: (a: ParagraphAnalysis) => string,
    colKey: (a: ParagraphAnalysis) => string,
  ): Map<string, Map<string, number>> => {
    const table = new Map<string, Map<string, number>>();
    for (const a of items) {
      const key = rowKey(a);
      let row = table.get(key);
      if (!row) { row = new Map(); table.set(key, row); }
      bump(row, colKey(a));
    }
    return table;
  };
  const segments = ["unanimous", "majority", "unresolved"] as const;

  // ── Load inputs ─────────────────────────────────────────────────────
  console.log("Loading paragraphs...");
  const { records: paragraphs } = await readJsonl(PARAGRAPHS, Paragraph);
  console.log(` ${paragraphs.length.toLocaleString()} paragraphs`);
  const paraById = new Map(paragraphs.map(p => [p.id, p]));

  console.log("Loading annotations...");
  const { records: rawAnns, skipped } = await readJsonlRaw(ANNOTATIONS);
  // NOTE(review): records are cast, not validated — a malformed row will only
  // surface later as a runtime error when its fields are read.
  const anns = rawAnns as Ann[];
  console.log(` ${anns.length.toLocaleString()} annotations (${skipped} skipped)`);

  // Group annotations by paragraph
  const byParagraph = new Map<string, Ann[]>();
  for (const a of anns) {
    let arr = byParagraph.get(a.paragraphId);
    if (!arr) { arr = []; byParagraph.set(a.paragraphId, arr); }
    arr.push(a);
  }

  // Count paragraphs per company (for company-size bucketing). This counts
  // every loaded paragraph, not only the ones that end up analysed.
  const companyParaCount = new Map<string, number>();
  for (const p of paragraphs) bump(companyParaCount, p.filing.companyName);

  // ── Analyze each paragraph ──────────────────────────────────────────
  const analyzed: ParagraphAnalysis[] = [];
  for (const [pid, pannAnns] of byParagraph) {
    // Fewer than three votes cannot be segmented; unknown paragraph ids have no metadata.
    if (pannAnns.length < 3) continue;
    const para = paraById.get(pid);
    if (!para) continue;

    const cats = pannAnns.map(a => a.label.content_category);
    const specs = pannAnns.map(a => a.label.specificity_level);
    const catConfidences = pannAnns.map(a => a.label.category_confidence);
    const specConfidences = pannAnns.map(a => a.label.specificity_confidence);
    const catMaj = majority(cats);
    const specMaj = majority(specs);

    const catSeg: ParagraphAnalysis["catSegment"] = cats.every(c => c === cats[0])
      ? "cat-unanimous"
      : catMaj ? "cat-majority" : "cat-split";
    const specSeg: ParagraphAnalysis["specSegment"] = specs.every(s => s === specs[0])
      ? "spec-unanimous"
      : specMaj ? "spec-majority" : "spec-split";

    // Unanimous needs both dimensions unanimous; majority needs a winner on
    // both; anything else is unresolved.
    let segment: Segment;
    if (catSeg === "cat-unanimous" && specSeg === "spec-unanimous") {
      segment = "unanimous";
    } else if (catMaj && specMaj) {
      segment = "majority";
    } else {
      segment = "unresolved";
    }

    analyzed.push({
      id: pid,
      segment,
      catSegment: catSeg,
      specSegment: specSeg,
      // With no winner, fall back to the first annotation's label.
      majorityCat: catMaj?.value ?? cats[0],
      majoritySpec: specMaj?.value ?? specs[0],
      cats,
      specs,
      catConfidences,
      specConfidences,
      companyName: para.filing.companyName,
      ticker: para.filing.ticker,
      filingType: para.filing.filingType,
      filingDate: para.filing.filingDate,
      fiscalYear: para.filing.fiscalYear,
      secItem: para.filing.secItem,
      wordCount: para.wordCount,
    });
  }
  console.log(`\n${analyzed.length.toLocaleString()} paragraphs analyzed\n`);

  // With nothing analysed, the quintile boundaries and averages below would
  // print "undefined" and "NaN", so stop here instead.
  if (analyzed.length === 0) {
    console.log("Nothing to report: no paragraph has at least 3 annotations and a matching paragraph record.");
    return;
  }

  // ── Overview ─────────────────────────────────────────────────────────
  const segCounts = { unanimous: 0, majority: 0, unresolved: 0 };
  for (const a of analyzed) segCounts[a.segment]++;
  console.log("SEGMENT OVERVIEW:");
  console.log(` Unanimous: ${segCounts.unanimous.toLocaleString()} (${pct(segCounts.unanimous, analyzed.length)})`);
  console.log(` Majority: ${segCounts.majority.toLocaleString()} (${pct(segCounts.majority, analyzed.length)})`);
  console.log(` Unresolved: ${segCounts.unresolved.toLocaleString()} (${pct(segCounts.unresolved, analyzed.length)})`);

  // Cat vs spec disagreement breakdown
  const nonUnanimous = analyzed.filter(a => a.segment !== "unanimous");
  const catSpecBreakdown = { catOnly: 0, specOnly: 0, both: 0 };
  for (const a of nonUnanimous) {
    const catDis = a.catSegment !== "cat-unanimous";
    const specDis = a.specSegment !== "spec-unanimous";
    if (catDis && specDis) catSpecBreakdown.both++;
    else if (catDis) catSpecBreakdown.catOnly++;
    else catSpecBreakdown.specOnly++;
  }
  const disputed = segCounts.majority + segCounts.unresolved;
  console.log(`\n Disagreement breakdown (of ${disputed.toLocaleString()} non-unanimous):`);
  console.log(` Category only: ${catSpecBreakdown.catOnly.toLocaleString()} (${pct(catSpecBreakdown.catOnly, disputed)})`);
  console.log(` Specificity only: ${catSpecBreakdown.specOnly.toLocaleString()} (${pct(catSpecBreakdown.specOnly, disputed)})`);
  console.log(` Both: ${catSpecBreakdown.both.toLocaleString()} (${pct(catSpecBreakdown.both, disputed)})`);

  // ── Distribution functions ──────────────────────────────────────────
  function buildDist(keyFn: (a: ParagraphAnalysis) => string) {
    const dist = new Map<string, { total: number; unanimous: number; majority: number; unresolved: number }>();
    for (const a of analyzed) {
      const key = keyFn(a);
      let entry = dist.get(key);
      if (!entry) { entry = { total: 0, unanimous: 0, majority: 0, unresolved: 0 }; dist.set(key, entry); }
      entry.total++;
      entry[a.segment]++;
    }
    return dist;
  }

  // ── 1. By fiscal year ───────────────────────────────────────────────
  printDistribution("BY FISCAL YEAR", buildDist(a => String(a.fiscalYear)));

  // ── 2. By filing type ───────────────────────────────────────────────
  printDistribution("BY FILING TYPE", buildDist(a => a.filingType));

  // ── 3. By SEC item ──────────────────────────────────────────────────
  printDistribution("BY SEC ITEM", buildDist(a => a.secItem));

  // ── 4. By majority category ─────────────────────────────────────────
  printDistribution("BY MAJORITY CATEGORY", buildDist(a => a.majorityCat));

  // ── 5. By majority specificity ──────────────────────────────────────
  const specLabels: Record<number, string> = {
    1: "1-Generic", 2: "2-Sector", 3: "3-Firm", 4: "4-Quantified"
  };
  const specLabel = (a: ParagraphAnalysis): string => specLabels[a.majoritySpec] ?? String(a.majoritySpec);
  printDistribution("BY MAJORITY SPECIFICITY", buildDist(specLabel));

  // ── 6. By confidence pattern ────────────────────────────────────────
  // Sort a copy: Array#sort works in place and would otherwise reorder the
  // confidence arrays stored on the analysed records.
  printDistribution("BY CATEGORY CONFIDENCE PATTERN",
    buildDist(a => [...a.catConfidences].sort().join("/")));
  printDistribution("BY SPECIFICITY CONFIDENCE PATTERN",
    buildDist(a => [...a.specConfidences].sort().join("/")));

  // ── 7. By word count quintile ───────────────────────────────────────
  const wordCounts = analyzed.map(a => a.wordCount).sort((a, b) => a - b);
  const quantile = (fraction: number): number => wordCounts[Math.floor(wordCounts.length * fraction)];
  const q20 = quantile(0.2);
  const q40 = quantile(0.4);
  const q60 = quantile(0.6);
  const q80 = quantile(0.8);
  console.log(`\n Word count quintile boundaries: ${q20}, ${q40}, ${q60}, ${q80}`);
  printDistribution("BY WORD COUNT QUINTILE", buildDist(a => {
    if (a.wordCount <= q20) return `Q1 (≤${q20})`;
    if (a.wordCount <= q40) return `Q2 (${q20+1}-${q40})`;
    if (a.wordCount <= q60) return `Q3 (${q40+1}-${q60})`;
    if (a.wordCount <= q80) return `Q4 (${q60+1}-${q80})`;
    return `Q5 (>${q80})`;
  }));

  // ── 8. By company size bucket ───────────────────────────────────────
  printDistribution("BY COMPANY SIZE (paragraph count)", buildDist(a => {
    const n = companyParaCount.get(a.companyName) ?? 0;
    if (n <= 3) return "1-3 paras";
    if (n <= 6) return "4-6 paras";
    if (n <= 10) return "7-10 paras";
    if (n <= 20) return "11-20 paras";
    return "21+ paras";
  }));

  // ── 9. Cross-tab: category × segment ────────────────────────────────
  printCrossTab("CATEGORY × SEGMENT",
    crossCount(analyzed, a => a.majorityCat, a => a.segment), [...segments]);

  // ── 10. Cross-tab: specificity × segment ────────────────────────────
  printCrossTab("SPECIFICITY × SEGMENT",
    crossCount(analyzed, specLabel, a => a.segment), [...segments]);

  // ── 11. Cross-tab: fiscal year × category (for non-unanimous) ──────
  printCrossTab("FISCAL YEAR × CATEGORY (non-unanimous only)",
    crossCount(nonUnanimous, a => String(a.fiscalYear), a => a.majorityCat));

  // ── 12. Top disagreement companies ──────────────────────────────────
  const companyDisagree = new Map<string, { total: number; disputed: number }>();
  for (const a of analyzed) {
    let entry = companyDisagree.get(a.companyName);
    if (!entry) { entry = { total: 0, disputed: 0 }; companyDisagree.set(a.companyName, entry); }
    entry.total++;
    if (a.segment !== "unanimous") entry.disputed++;
  }
  // Companies with fewer than 5 analysed paragraphs are excluded from both rankings.
  const eligible = [...companyDisagree.entries()]
    .filter(([, v]) => v.total >= 5)
    .map(([name, v]) => ({ name, ...v, rate: v.disputed / v.total }));
  const printCompanies = (list: typeof eligible): void => {
    for (const c of list) {
      console.log(` ${c.name.slice(0, 45).padEnd(45)} ${c.disputed}/${c.total} disputed (${pct(c.disputed, c.total)})`);
    }
  };
  banner("TOP 30 COMPANIES BY DISAGREEMENT RATE (min 5 paragraphs)");
  printCompanies([...eligible].sort((a, b) => b.rate - a.rate).slice(0, 30));

  // ── 13. Bottom 30 companies (lowest disagreement) ──────────────────
  banner("TOP 30 COMPANIES BY AGREEMENT RATE (min 5 paragraphs)");
  printCompanies([...eligible].sort((a, b) => a.rate - b.rate).slice(0, 30));

  // ── 14. Specificity spread analysis ─────────────────────────────────
  banner("SPECIFICITY SPREAD (max - min) FOR NON-UNANIMOUS");
  const specSpread = new Map<number, number>();
  for (const a of analyzed) {
    if (a.specSegment === "spec-unanimous") continue;
    bump(specSpread, Math.max(...a.specs) - Math.min(...a.specs));
  }
  // Keyed and sorted numerically so that a spread of 10 would sort after 2.
  for (const [spread, count] of [...specSpread.entries()].sort(([a], [b]) => a - b)) {
    console.log(` spread-${spread}: ${count.toLocaleString()}`);
  }

  // ── 15. Most common category dispute pairs ──────────────────────────
  banner("TOP CATEGORY DISPUTE PAIRS");
  const catPairs = new Map<string, number>();
  for (const a of analyzed) {
    if (a.catSegment === "cat-unanimous") continue;
    // The distinct categories voted for, in sorted order, identify the dispute.
    bump(catPairs, [...new Set(a.cats)].sort().join(" ↔ "));
  }
  printTop(catPairs, 55);

  // ── 16. Spec dispute patterns ───────────────────────────────────────
  banner("TOP SPECIFICITY DISPUTE PATTERNS");
  const specPatterns = new Map<string, number>();
  for (const a of analyzed) {
    if (a.specSegment === "spec-unanimous") continue;
    const ordered = [...a.specs].sort((x, y) => x - y);
    bump(specPatterns, `[${ordered.join(",")}]`);
  }
  printTop(specPatterns, 20);

  // ── 17. Confidence vs agreement rate ────────────────────────────────
  banner("AVERAGE CONFIDENCE BY SEGMENT");
  const confScore = (c: string) => c === "high" ? 3 : c === "medium" ? 2 : 1;
  // Average over the paragraph's own number of annotations; paragraphs may
  // carry more than three.
  const meanScore = (levels: string[]): number =>
    levels.reduce((sum, c) => sum + confScore(c), 0) / levels.length;
  const formatAvg = (total: number, n: number): string => n === 0 ? "n/a" : (total / n).toFixed(2);
  for (const seg of segments) {
    const group = analyzed.filter(a => a.segment === seg);
    const catTotal = group.reduce((s, a) => s + meanScore(a.catConfidences), 0);
    const specTotal = group.reduce((s, a) => s + meanScore(a.specConfidences), 0);
    console.log(` ${seg.padEnd(12)} avg cat conf: ${formatAvg(catTotal, group.length)} avg spec conf: ${formatAvg(specTotal, group.length)}`);
  }

  // ── 18. All-low-confidence counts ───────────────────────────────────
  banner("ALL-LOW-CONFIDENCE PATTERNS");
  const allLow = (levels: string[]): boolean => levels.every(c => c === "low");
  const allLowCat = analyzed.filter(a => allLow(a.catConfidences));
  const allLowSpec = analyzed.filter(a => allLow(a.specConfidences));
  const allLowBoth = allLowCat.filter(a => allLow(a.specConfidences));
  console.log(` All-low cat confidence: ${allLowCat.length} (${pct(allLowCat.length, analyzed.length)})`);
  console.log(` All-low spec confidence: ${allLowSpec.length} (${pct(allLowSpec.length, analyzed.length)})`);
  console.log(` All-low both: ${allLowBoth.length} (${pct(allLowBoth.length, analyzed.length)})`);

  // Of those, segment distribution
  for (const [label, group] of [["All-low cat", allLowCat], ["All-low spec", allLowSpec]] as const) {
    const segDist = { unanimous: 0, majority: 0, unresolved: 0 };
    for (const a of group) segDist[a.segment]++;
    console.log(` ${label} → unanimous: ${segDist.unanimous}, majority: ${segDist.majority}, unresolved: ${segDist.unresolved}`);
  }
}
// Run the report. On failure, log the error and set a non-zero exit code so
// that shells and CI see the script as failed instead of exiting 0.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});