/**
 * Cross-tabulate agreement status against all paragraph metadata dimensions.
 *
 * Segments every paragraph that has at least three annotations into:
 * unanimous | majority | unresolved
 * Then breaks down by: fiscal year, filing type, sec item, category,
 * specificity, confidence, company size (paragraph count proxy),
 * word count quintile, and cross-dimensions.
 *
 * Usage: bun ts/scripts/segment-analysis.ts
 */
import { readJsonl, readJsonlRaw } from "../src/lib/jsonl.ts";
import { Paragraph } from "../src/schemas/paragraph.ts";

const PARAGRAPHS = new URL("../../data/paragraphs/paragraphs-clean.jsonl", import.meta.url).pathname;
const ANNOTATIONS = new URL("../../data/annotations/stage1.jsonl", import.meta.url).pathname;

// ── Types ──────────────────────────────────────────────────────────────

/** One annotation record as read from the annotations JSONL file. */
interface Ann {
  paragraphId: string;
  label: {
    content_category: string;
    specificity_level: number;
    category_confidence: string;
    specificity_confidence: string;
    reasoning: string;
  };
  provenance: {
    modelId: string;
    costUsd: number;
    inputTokens: number;
    outputTokens: number;
    reasoningTokens: number;
    latencyMs: number;
    requestedAt: string;
  };
}

type Segment = "unanimous" | "majority" | "unresolved";

/** Per-key tally used by the distribution tables: a total plus one counter per segment. */
interface SegmentCounts {
  total: number;
  unanimous: number;
  majority: number;
  unresolved: number;
}

/** Agreement summary for one paragraph, joined with the filing metadata it is broken down by. */
interface ParagraphAnalysis {
  id: string;
  segment: Segment;
  catSegment: "cat-unanimous" | "cat-majority" | "cat-split";
  specSegment: "spec-unanimous" | "spec-majority" | "spec-split";
  majorityCat: string;
  majoritySpec: number;
  cats: string[];
  specs: number[];
  catConfidences: string[];
  specConfidences: string[];
  // Filing metadata
  companyName: string;
  ticker: string;
  filingType: string;
  filingDate: string;
  fiscalYear: number;
  secItem: string;
  wordCount: number;
}

// ── Helpers ────────────────────────────────────────────────────────────

/** Format `n / total` as a percentage with one decimal; a zero total yields "0.0%". */
function pct(n: number, total: number): string {
  return total === 0 ? "0.0%" : `${((n / total) * 100).toFixed(1)}%`;
}

/**
 * Find the most frequent value in `arr`.
 *
 * @returns the most frequent value and its count when at least two entries
 *   agree on it, otherwise `null`. On a tie the value encountered first wins.
 *   Note the threshold is "two or more agree", which equals a strict majority
 *   only when `arr` has exactly three entries.
 */
function majority<T>(arr: T[]): { value: T; count: number } | null {
  const counts = new Map<T, number>();
  for (const v of arr) counts.set(v, (counts.get(v) ?? 0) + 1);

  let best: T | null = null;
  let bestCount = 0;
  for (const [v, c] of counts) {
    if (c > bestCount) {
      best = v;
      bestCount = c;
    }
  }
  return best !== null && bestCount >= 2 ? { value: best, count: bestCount } : null;
}

/**
 * Print a table with one row per key, sorted by descending total, showing the
 * segment counts plus the unanimous and unresolved percentages.
 */
function printDistribution(label: string, counts: Map<string, SegmentCounts>) {
  console.log(`\n${"═".repeat(70)}`);
  console.log(` ${label}`);
  console.log(`${"═".repeat(70)}`);

  const sorted = [...counts.entries()].sort(([, a], [, b]) => b.total - a.total);
  // The trailing 20 is a minimum column width and keeps Math.max well-defined for an empty table.
  const maxKeyLen = Math.max(...sorted.map(([k]) => k.length), 20);

  console.log(
    ` ${"".padEnd(maxKeyLen)} ${"Total".padStart(7)} ${"Unan".padStart(7)} ${"Maj".padStart(7)} ${"Unres".padStart(7)} ${"Unan%".padStart(7)} ${"Unres%".padStart(7)}`
  );
  console.log(` ${"─".repeat(maxKeyLen + 50)}`);

  for (const [key, v] of sorted) {
    console.log(
      ` ${key.padEnd(maxKeyLen)} ${String(v.total).padStart(7)} ${String(v.unanimous).padStart(7)} ${String(v.majority).padStart(7)} ${String(v.unresolved).padStart(7)} ${pct(v.unanimous, v.total).padStart(7)} ${pct(v.unresolved, v.total).padStart(7)}`
    );
  }
}

/**
 * Print a row-normalized cross-tabulation: each cell is that column's share of
 * the row total, and the row total is appended as `(n=…)`.
 *
 * @param colOrder explicit column order; when omitted, every column key found
 *   in `rows` is used, sorted ascending.
 */
function printCrossTab(label: string, rows: Map<string, Map<string, number>>, colOrder?: string[]) {
  console.log(`\n${"═".repeat(70)}`);
  console.log(` ${label}`);
  console.log(`${"═".repeat(70)}`);

  const allCols = colOrder ?? [...new Set([...rows.values()].flatMap(m => [...m.keys()]))].sort();
  const maxKeyLen = Math.max(...[...rows.keys()].map(k => k.length), 15);
  const colWidth = 8;

  console.log(
    ` ${"".padEnd(maxKeyLen)} ${allCols.map(c => c.slice(0, colWidth).padStart(colWidth)).join(" ")}`
  );
  console.log(` ${"─".repeat(maxKeyLen + (colWidth + 2) * allCols.length)}`);

  const sortedRows = [...rows.entries()].sort(([a], [b]) => a.localeCompare(b));
  for (const [key, cols] of sortedRows) {
    const total = [...cols.values()].reduce((a, b) => a + b, 0);
    const cells = allCols.map(c => {
      const n = cols.get(c) ?? 0;
      return pct(n, total).padStart(colWidth);
    });
    console.log(` ${key.padEnd(maxKeyLen)} ${cells.join(" ")} (n=${total})`);
  }
}

// ── Main ───────────────────────────────────────────────────────────────

/**
 * Load paragraphs and annotations, classify each paragraph's annotator
 * agreement, and print every breakdown table to stdout.
 */
async function main(): Promise<void> {
  console.log("Loading paragraphs...");
  const { records: paragraphs } = await readJsonl(PARAGRAPHS, Paragraph);
  console.log(` ${paragraphs.length.toLocaleString()} paragraphs`);
  const paraById = new Map(paragraphs.map(p => [p.id, p]));

  console.log("Loading annotations...");
  const { records: rawAnns, skipped } = await readJsonlRaw(ANNOTATIONS);
  // NOTE(review): the raw records are cast, not validated — confirm the file matches `Ann`.
  const anns = rawAnns as Ann[];
  console.log(` ${anns.length.toLocaleString()} annotations (${skipped} skipped)`);

  // Group annotations by paragraph
  const byParagraph = new Map<string, Ann[]>();
  for (const a of anns) {
    let arr = byParagraph.get(a.paragraphId);
    if (!arr) {
      arr = [];
      byParagraph.set(a.paragraphId, arr);
    }
    arr.push(a);
  }

  // Count paragraphs per company (for company-size bucketing)
  const companyParaCount = new Map<string, number>();
  for (const p of paragraphs) {
    const name = p.filing.companyName;
    companyParaCount.set(name, (companyParaCount.get(name) ?? 0) + 1);
  }

  // ── Analyze each paragraph ──────────────────────────────────────────
  const analyzed: ParagraphAnalysis[] = [];
  for (const [pid, pannAnns] of byParagraph) {
    // Fewer than three annotations cannot be segmented; more than three are kept.
    if (pannAnns.length < 3) continue;
    const para = paraById.get(pid);
    if (!para) continue;

    const cats = pannAnns.map(a => a.label.content_category);
    const specs = pannAnns.map(a => a.label.specificity_level);
    const catConfidences = pannAnns.map(a => a.label.category_confidence);
    const specConfidences = pannAnns.map(a => a.label.specificity_confidence);

    const catMaj = majority(cats);
    const specMaj = majority(specs);

    const catSeg = cats.every(c => c === cats[0])
      ? "cat-unanimous"
      : catMaj
        ? "cat-majority"
        : "cat-split";
    const specSeg = specs.every(s => s === specs[0])
      ? "spec-unanimous"
      : specMaj
        ? "spec-majority"
        : "spec-split";

    let segment: Segment;
    if (catSeg === "cat-unanimous" && specSeg === "spec-unanimous") {
      segment = "unanimous";
    } else if (catMaj && specMaj) {
      segment = "majority";
    } else {
      segment = "unresolved";
    }

    analyzed.push({
      id: pid,
      segment,
      catSegment: catSeg,
      specSegment: specSeg,
      // With no agreed value, fall back to the first annotation's label.
      majorityCat: catMaj?.value ?? cats[0],
      majoritySpec: specMaj?.value ?? specs[0],
      cats,
      specs,
      catConfidences,
      specConfidences,
      companyName: para.filing.companyName,
      ticker: para.filing.ticker,
      filingType: para.filing.filingType,
      filingDate: para.filing.filingDate,
      fiscalYear: para.filing.fiscalYear,
      secItem: para.filing.secItem,
      wordCount: para.wordCount,
    });
  }

  console.log(`\n${analyzed.length.toLocaleString()} paragraphs analyzed\n`);

  // Without any analyzed paragraph the quintile boundaries below would be undefined.
  if (analyzed.length === 0) {
    console.log("No paragraph has at least 3 annotations and a matching paragraph record; nothing to report.");
    return;
  }

  // ── Overview ─────────────────────────────────────────────────────────
  const segCounts: Record<Segment, number> = { unanimous: 0, majority: 0, unresolved: 0 };
  for (const a of analyzed) segCounts[a.segment]++;

  console.log("SEGMENT OVERVIEW:");
  console.log(` Unanimous: ${segCounts.unanimous.toLocaleString()} (${pct(segCounts.unanimous, analyzed.length)})`);
  console.log(` Majority: ${segCounts.majority.toLocaleString()} (${pct(segCounts.majority, analyzed.length)})`);
  console.log(` Unresolved: ${segCounts.unresolved.toLocaleString()} (${pct(segCounts.unresolved, analyzed.length)})`);

  // Cat vs spec disagreement breakdown
  const catSpecBreakdown = { catOnly: 0, specOnly: 0, both: 0 };
  for (const a of analyzed) {
    if (a.segment === "unanimous") continue;
    const catDis = a.catSegment !== "cat-unanimous";
    const specDis = a.specSegment !== "spec-unanimous";
    if (catDis && specDis) catSpecBreakdown.both++;
    else if (catDis) catSpecBreakdown.catOnly++;
    else catSpecBreakdown.specOnly++;
  }

  const disputed = segCounts.majority + segCounts.unresolved;
  console.log(`\n Disagreement breakdown (of ${disputed.toLocaleString()} non-unanimous):`);
  console.log(` Category only: ${catSpecBreakdown.catOnly.toLocaleString()} (${pct(catSpecBreakdown.catOnly, disputed)})`);
  console.log(` Specificity only: ${catSpecBreakdown.specOnly.toLocaleString()} (${pct(catSpecBreakdown.specOnly, disputed)})`);
  console.log(` Both: ${catSpecBreakdown.both.toLocaleString()} (${pct(catSpecBreakdown.both, disputed)})`);

  // ── Distribution functions ──────────────────────────────────────────
  /** Tally segment counts over all analyzed paragraphs, grouped by `keyFn`. */
  function buildDist(keyFn: (a: ParagraphAnalysis) => string): Map<string, SegmentCounts> {
    const dist = new Map<string, SegmentCounts>();
    for (const a of analyzed) {
      const key = keyFn(a);
      let entry = dist.get(key);
      if (!entry) {
        entry = { total: 0, unanimous: 0, majority: 0, unresolved: 0 };
        dist.set(key, entry);
      }
      entry.total++;
      entry[a.segment]++;
    }
    return dist;
  }

  /** Count analyzed paragraphs into a rowKey → segment table for printCrossTab. */
  function buildSegmentCrossTab(rowKeyFn: (a: ParagraphAnalysis) => string): Map<string, Map<string, number>> {
    const table = new Map<string, Map<string, number>>();
    for (const a of analyzed) {
      const key = rowKeyFn(a);
      let row = table.get(key);
      if (!row) {
        row = new Map<string, number>();
        table.set(key, row);
      }
      row.set(a.segment, (row.get(a.segment) ?? 0) + 1);
    }
    return table;
  }

  // ── 1. By fiscal year ───────────────────────────────────────────────
  printDistribution("BY FISCAL YEAR", buildDist(a => String(a.fiscalYear)));

  // ── 2. By filing type ───────────────────────────────────────────────
  printDistribution("BY FILING TYPE", buildDist(a => a.filingType));

  // ── 3. By SEC item ──────────────────────────────────────────────────
  printDistribution("BY SEC ITEM", buildDist(a => a.secItem));

  // ── 4. By majority category ─────────────────────────────────────────
  printDistribution("BY MAJORITY CATEGORY", buildDist(a => a.majorityCat));

  // ── 5. By majority specificity ──────────────────────────────────────
  const specLabels: Record<number, string> = { 1: "1-Generic", 2: "2-Sector", 3: "3-Firm", 4: "4-Quantified" };
  const specLabel = (a: ParagraphAnalysis) => specLabels[a.majoritySpec] ?? String(a.majoritySpec);
  printDistribution("BY MAJORITY SPECIFICITY", buildDist(specLabel));

  // ── 6. By confidence pattern ────────────────────────────────────────
  // Sort a copy: Array.prototype.sort works in place and would reorder the stored records.
  printDistribution("BY CATEGORY CONFIDENCE PATTERN", buildDist(a => [...a.catConfidences].sort().join("/")));
  printDistribution("BY SPECIFICITY CONFIDENCE PATTERN", buildDist(a => [...a.specConfidences].sort().join("/")));

  // ── 7. By word count quintile ───────────────────────────────────────
  const wordCounts = analyzed.map(a => a.wordCount).sort((x, y) => x - y);
  const q20 = wordCounts[Math.floor(wordCounts.length * 0.2)];
  const q40 = wordCounts[Math.floor(wordCounts.length * 0.4)];
  const q60 = wordCounts[Math.floor(wordCounts.length * 0.6)];
  const q80 = wordCounts[Math.floor(wordCounts.length * 0.8)];
  console.log(`\n Word count quintile boundaries: ${q20}, ${q40}, ${q60}, ${q80}`);

  printDistribution("BY WORD COUNT QUINTILE", buildDist(a => {
    if (a.wordCount <= q20) return `Q1 (≤${q20})`;
    if (a.wordCount <= q40) return `Q2 (${q20+1}-${q40})`;
    if (a.wordCount <= q60) return `Q3 (${q40+1}-${q60})`;
    if (a.wordCount <= q80) return `Q4 (${q60+1}-${q80})`;
    return `Q5 (>${q80})`;
  }));

  // ── 8. By company size bucket ───────────────────────────────────────
  // Size is the company's paragraph count across the whole paragraph file, not just analyzed ones.
  printDistribution("BY COMPANY SIZE (paragraph count)", buildDist(a => {
    const n = companyParaCount.get(a.companyName) ?? 0;
    if (n <= 3) return "1-3 paras";
    if (n <= 6) return "4-6 paras";
    if (n <= 10) return "7-10 paras";
    if (n <= 20) return "11-20 paras";
    return "21+ paras";
  }));

  // ── 9. Cross-tab: category × segment ────────────────────────────────
  printCrossTab("CATEGORY × SEGMENT", buildSegmentCrossTab(a => a.majorityCat), ["unanimous", "majority", "unresolved"]);

  // ── 10. Cross-tab: specificity × segment ────────────────────────────
  printCrossTab("SPECIFICITY × SEGMENT", buildSegmentCrossTab(specLabel), ["unanimous", "majority", "unresolved"]);

  // ── 11. Cross-tab: fiscal year × category (for non-unanimous) ──────
  const yearByCat = new Map<string, Map<string, number>>();
  for (const a of analyzed) {
    if (a.segment === "unanimous") continue;
    const key = String(a.fiscalYear);
    let row = yearByCat.get(key);
    if (!row) {
      row = new Map<string, number>();
      yearByCat.set(key, row);
    }
    row.set(a.majorityCat, (row.get(a.majorityCat) ?? 0) + 1);
  }
  printCrossTab("FISCAL YEAR × CATEGORY (non-unanimous only)", yearByCat);

  // ── 12. Top disagreement companies ──────────────────────────────────
  const companyDisagree = new Map<string, { total: number; disputed: number }>();
  for (const a of analyzed) {
    let entry = companyDisagree.get(a.companyName);
    if (!entry) {
      entry = { total: 0, disputed: 0 };
      companyDisagree.set(a.companyName, entry);
    }
    entry.total++;
    if (a.segment !== "unanimous") entry.disputed++;
  }

  // Both rankings share the same eligible set; only the sort direction differs.
  const eligibleCompanies = [...companyDisagree.entries()]
    .filter(([, v]) => v.total >= 5)
    .map(([name, v]) => ({ name, ...v, rate: v.disputed / v.total }));

  console.log(`\n${"═".repeat(70)}`);
  console.log(" TOP 30 COMPANIES BY DISAGREEMENT RATE (min 5 paragraphs)");
  console.log(`${"═".repeat(70)}`);

  const companyRanked = [...eligibleCompanies].sort((x, y) => y.rate - x.rate).slice(0, 30);
  for (const c of companyRanked) {
    console.log(` ${c.name.slice(0, 45).padEnd(45)} ${c.disputed}/${c.total} disputed (${pct(c.disputed, c.total)})`);
  }

  // ── 13. Bottom 30 companies (lowest disagreement) ──────────────────
  console.log(`\n${"═".repeat(70)}`);
  console.log(" TOP 30 COMPANIES BY AGREEMENT RATE (min 5 paragraphs)");
  console.log(`${"═".repeat(70)}`);

  const companyAgreed = [...eligibleCompanies].sort((x, y) => x.rate - y.rate).slice(0, 30);
  for (const c of companyAgreed) {
    console.log(` ${c.name.slice(0, 45).padEnd(45)} ${c.disputed}/${c.total} disputed (${pct(c.disputed, c.total)})`);
  }

  // ── 14. Specificity spread analysis ─────────────────────────────────
  console.log(`\n${"═".repeat(70)}`);
  console.log(" SPECIFICITY SPREAD (max - min) FOR NON-UNANIMOUS");
  console.log(`${"═".repeat(70)}`);

  const specSpread = new Map<string, number>();
  for (const a of analyzed) {
    if (a.specSegment === "spec-unanimous") continue;
    const spread = Math.max(...a.specs) - Math.min(...a.specs);
    const key = `spread-${spread}`;
    specSpread.set(key, (specSpread.get(key) ?? 0) + 1);
  }
  // Default sort compares the stringified [key, count] pairs, i.e. orders by key text.
  for (const [key, count] of [...specSpread.entries()].sort()) {
    console.log(` ${key}: ${count.toLocaleString()}`);
  }

  // ── 15. Most common category dispute pairs ──────────────────────────
  console.log(`\n${"═".repeat(70)}`);
  console.log(" TOP CATEGORY DISPUTE PAIRS");
  console.log(`${"═".repeat(70)}`);

  const catPairs = new Map<string, number>();
  for (const a of analyzed) {
    if (a.catSegment === "cat-unanimous") continue;
    // The key is the set of distinct categories chosen, so it may name more than two.
    const sorted = [...new Set(a.cats)].sort();
    const key = sorted.join(" ↔ ");
    catPairs.set(key, (catPairs.get(key) ?? 0) + 1);
  }
  for (const [pair, count] of [...catPairs.entries()].sort(([, x], [, y]) => y - x).slice(0, 15)) {
    console.log(` ${pair.padEnd(55)} ${count.toLocaleString()}`);
  }

  // ── 16. Spec dispute patterns ───────────────────────────────────────
  console.log(`\n${"═".repeat(70)}`);
  console.log(" TOP SPECIFICITY DISPUTE PATTERNS");
  console.log(`${"═".repeat(70)}`);

  const specPatterns = new Map<string, number>();
  for (const a of analyzed) {
    if (a.specSegment === "spec-unanimous") continue;
    const sorted = [...a.specs].sort((x, y) => x - y);
    const key = `[${sorted.join(",")}]`;
    specPatterns.set(key, (specPatterns.get(key) ?? 0) + 1);
  }
  for (const [pattern, count] of [...specPatterns.entries()].sort(([, x], [, y]) => y - x).slice(0, 15)) {
    console.log(` ${pattern.padEnd(20)} ${count.toLocaleString()}`);
  }

  // ── 17. Confidence vs agreement rate ────────────────────────────────
  console.log(`\n${"═".repeat(70)}`);
  console.log(" AVERAGE CONFIDENCE BY SEGMENT");
  console.log(`${"═".repeat(70)}`);

  // Anything other than "high" or "medium" scores as 1.
  const confScore = (c: string) => (c === "high" ? 3 : c === "medium" ? 2 : 1);
  // Average over the paragraph's actual annotation count, which can exceed three.
  const meanScore = (confs: string[]) => confs.reduce((sum, c) => sum + confScore(c), 0) / confs.length;
  const formatGroupAverage = (group: ParagraphAnalysis[], pick: (a: ParagraphAnalysis) => string[]) =>
    group.length === 0
      ? "n/a"
      : (group.reduce((sum, a) => sum + meanScore(pick(a)), 0) / group.length).toFixed(2);

  for (const seg of ["unanimous", "majority", "unresolved"] as const) {
    const group = analyzed.filter(a => a.segment === seg);
    const avgCatConf = formatGroupAverage(group, a => a.catConfidences);
    const avgSpecConf = formatGroupAverage(group, a => a.specConfidences);
    console.log(` ${seg.padEnd(12)} avg cat conf: ${avgCatConf} avg spec conf: ${avgSpecConf}`);
  }

  // ── 18. All-low-confidence counts ───────────────────────────────────
  console.log(`\n${"═".repeat(70)}`);
  console.log(" ALL-LOW-CONFIDENCE PATTERNS");
  console.log(`${"═".repeat(70)}`);

  const allLowCat = analyzed.filter(a => a.catConfidences.every(c => c === "low"));
  const allLowSpec = analyzed.filter(a => a.specConfidences.every(c => c === "low"));
  const allLowBoth = analyzed.filter(a => a.catConfidences.every(c => c === "low") && a.specConfidences.every(c => c === "low"));

  console.log(` All-low cat confidence: ${allLowCat.length} (${pct(allLowCat.length, analyzed.length)})`);
  console.log(` All-low spec confidence: ${allLowSpec.length} (${pct(allLowSpec.length, analyzed.length)})`);
  console.log(` All-low both: ${allLowBoth.length} (${pct(allLowBoth.length, analyzed.length)})`);

  // Of those, segment distribution
  for (const [label, group] of [["All-low cat", allLowCat], ["All-low spec", allLowSpec]] as const) {
    const segDist: Record<Segment, number> = { unanimous: 0, majority: 0, unresolved: 0 };
    for (const a of group) segDist[a.segment]++;
    console.log(` ${label} → unanimous: ${segDist.unanimous}, majority: ${segDist.majority}, unresolved: ${segDist.unresolved}`);
  }
}

main().catch(console.error);