/**
 * Cross-tabulate agreement status against all paragraph metadata dimensions.
 *
 * Segments every paragraph into: unanimous | majority | unresolved
 * Then breaks down by: fiscal year, filing type, SEC item, category,
 * specificity, confidence, company size (paragraph count proxy),
 * word count quintile, and cross-dimensions.
 *
 * Usage: bun ts/scripts/segment-analysis.ts
 */
import { fileURLToPath } from "node:url";

import { readJsonl, readJsonlRaw } from "../src/lib/jsonl.ts";
import { Paragraph } from "../src/schemas/paragraph.ts";

// Input files, resolved relative to this script. `fileURLToPath` is used
// instead of `URL.pathname` because `pathname` stays percent-encoded (a space
// in the checkout path becomes "%20") and is not a valid path on Windows.
const PARAGRAPHS = fileURLToPath(new URL("../../data/paragraphs/paragraphs-clean.jsonl", import.meta.url));
const ANNOTATIONS = fileURLToPath(new URL("../../data/annotations/stage1.jsonl", import.meta.url));

// ── Types ──────────────────────────────────────────────────────────────
/**
 * One annotation record as read from the stage-1 annotations JSONL.
 *
 * Records are cast to this shape rather than validated, so every field is an
 * expectation about the file, not a guarantee.
 */
interface Ann {
  /** Id of the annotated paragraph; matched against the `id` of the loaded paragraphs. */
  paragraphId: string;
  /** The labels the annotator assigned to the paragraph. */
  label: {
    content_category: string;
    /** Mapped by the script to 1-Generic, 2-Sector, 3-Firm, 4-Quantified. */
    specificity_level: number;
    /** Scored by the script as "high" = 3, "medium" = 2, anything else = 1. */
    category_confidence: string;
    /** Scored the same way as `category_confidence`. */
    specificity_confidence: string;
    reasoning: string;
  };
  /** Bookkeeping about the call that produced the label; not read by this script. */
  provenance: {
    modelId: string;
    costUsd: number;
    inputTokens: number;
    outputTokens: number;
    reasoningTokens: number;
    latencyMs: number;
    requestedAt: string;
  };
}

/**
 * Overall agreement class of a paragraph:
 * - "unanimous": every annotation agrees on both category and specificity;
 * - "majority": not unanimous, but both dimensions have a majority value;
 * - "unresolved": at least one dimension has no majority value.
 */
type Segment = "unanimous" | "majority" | "unresolved";

/** Per-paragraph agreement summary joined with the filing metadata used for the breakdowns. */
interface ParagraphAnalysis {
  id: string;
  /** Combined agreement class across category and specificity. */
  segment: Segment;
  /** Agreement class for the category dimension alone. */
  catSegment: "cat-unanimous" | "cat-majority" | "cat-split";
  /** Agreement class for the specificity dimension alone. */
  specSegment: "spec-unanimous" | "spec-majority" | "spec-split";
  /** Majority category, or the first annotation's category when there is none. */
  majorityCat: string;
  /** Majority specificity, or the first annotation's specificity when there is none. */
  majoritySpec: number;
  /** Category of each annotation, in annotation order. */
  cats: string[];
  /** Specificity level of each annotation, in annotation order. */
  specs: number[];
  /** Category confidence of each annotation. */
  catConfidences: string[];
  /** Specificity confidence of each annotation. */
  specConfidences: string[];
  // Filing metadata
  companyName: string;
  ticker: string;
  filingType: string;
  filingDate: string;
  fiscalYear: number;
  secItem: string;
  wordCount: number;
}

// ── Helpers ────────────────────────────────────────────────────────────
/**
 * Formats `n / total` as a percentage with one decimal place.
 *
 * @param n - Numerator (count in the group of interest).
 * @param total - Denominator (size of the whole).
 * @returns A string such as "25.0%"; "0.0%" when `total` is 0, so empty
 *   groups never print "NaN%".
 */
function pct(n: number, total: number): string {
  if (total === 0) return "0.0%";
  return `${((n / total) * 100).toFixed(1)}%`;
}

/**
 * Finds the value that a majority of the entries agree on.
 *
 * A value qualifies only if it occurs at least twice AND in more than half of
 * the entries. For the usual three annotations this is "2 of 3". The strict
 * half rule stops a 2-2 tie or a 2-of-5 plurality from being reported as a
 * majority, where the winner would otherwise depend on insertion order.
 *
 * @param arr - Values to tally; compared with `Map` key equality (SameValueZero).
 * @returns The majority value and its count, or `null` when there is none.
 */
function majority<T>(arr: readonly T[]): { value: T; count: number } | null {
  const counts = new Map<T, number>();
  for (const v of arr) counts.set(v, (counts.get(v) ?? 0) + 1);

  // Track the winner as an object so a legitimate `null`/`undefined` value in
  // `arr` is not mistaken for "nothing found yet".
  let best: { value: T; count: number } | null = null;
  for (const [value, count] of counts) {
    if (best === null || count > best.count) best = { value, count };
  }
  if (best === null) return null;

  const isStrictMajority = best.count * 2 > arr.length;
  return best.count >= 2 && isStrictMajority ? best : null;
}

/**
 * Prints one agreement table to stdout: a banner, a header row, and one row
 * per key with the total, the per-segment counts, and the unanimous and
 * unresolved percentages.
 *
 * @param label - Table title shown in the banner.
 * @param counts - Per-key tallies; rows are printed in descending `total` order.
 */
function printDistribution(
  label: string,
  counts: Map<string, { total: number; unanimous: number; majority: number; unresolved: number }>,
): void {
  const banner = "═".repeat(70);
  console.log(`\n${banner}`);
  console.log(` ${label}`);
  console.log(banner);

  const sorted = [...counts.entries()].sort(([, a], [, b]) => b.total - a.total);
  // The key column is at least 20 characters wide so short keys still line up.
  const maxKeyLen = Math.max(...sorted.map(([k]) => k.length), 20);
  const cell = (text: string | number): string => String(text).padStart(7);

  const headings = ["Total", "Unan", "Maj", "Unres", "Unan%", "Unres%"];
  console.log(` ${"".padEnd(maxKeyLen)} ${headings.map(h => cell(h)).join(" ")}`);
  console.log(` ${"─".repeat(maxKeyLen + 50)}`);

  for (const [key, v] of sorted) {
    const cells = [
      v.total,
      v.unanimous,
      v.majority,
      v.unresolved,
      pct(v.unanimous, v.total),
      pct(v.unresolved, v.total),
    ];
    console.log(` ${key.padEnd(maxKeyLen)} ${cells.map(c => cell(c)).join(" ")}`);
  }
}

/**
 * Prints a row-normalised cross-tabulation to stdout: each cell is that
 * column's share of the row total, and each row ends with its `(n=…)` count.
 *
 * @param label - Table title shown in the banner.
 * @param rows - Counts keyed by row, then by column. Rows are printed in
 *   `localeCompare` order of their keys.
 * @param colOrder - Columns to print, in order. Defaults to every column key
 *   found in `rows`, sorted. Columns left out of `colOrder` are not printed
 *   but still count toward the row total.
 */
function printCrossTab(label: string, rows: Map<string, Map<string, number>>, colOrder?: string[]): void {
  const banner = "═".repeat(70);
  console.log(`\n${banner}`);
  console.log(` ${label}`);
  console.log(banner);

  const allCols = colOrder ?? [...new Set([...rows.values()].flatMap(m => [...m.keys()]))].sort();
  const maxKeyLen = Math.max(...[...rows.keys()].map(k => k.length), 15);
  const colWidth = 8;

  // Column headings are truncated to the column width so long names cannot break alignment.
  const headings = allCols.map(c => c.slice(0, colWidth).padStart(colWidth));
  console.log(` ${"".padEnd(maxKeyLen)} ${headings.join(" ")}`);
  console.log(` ${"─".repeat(maxKeyLen + (colWidth + 2) * allCols.length)}`);

  const sortedRows = [...rows.entries()].sort(([a], [b]) => a.localeCompare(b));
  for (const [key, cols] of sortedRows) {
    let total = 0;
    for (const n of cols.values()) total += n;

    const cells = allCols.map(c => pct(cols.get(c) ?? 0, total).padStart(colWidth));
    console.log(` ${key.padEnd(maxKeyLen)} ${cells.join(" ")} (n=${total})`);
  }
}

// ── Main ───────────────────────────────────────────────────────────────
/**
 * Runs the whole report: loads paragraphs and stage-1 annotations, classifies
 * every paragraph that has at least three annotations as unanimous / majority /
 * unresolved, and prints the agreement breakdowns to stdout.
 *
 * Paragraphs with fewer than three annotations, and annotations whose
 * paragraph id is not in the paragraph file, are skipped silently.
 */
async function main(): Promise<void> {
  /** Prints the three-line banner that opens each hand-rolled report section. */
  const banner = (title: string): void => {
    console.log(`\n${"═".repeat(70)}`);
    console.log(title);
    console.log(`${"═".repeat(70)}`);
  };

  console.log("Loading paragraphs...");
  const { records: paragraphs } = await readJsonl(PARAGRAPHS, Paragraph);
  console.log(` ${paragraphs.length.toLocaleString()} paragraphs`);

  const paraById = new Map(paragraphs.map(p => [p.id, p]));

  console.log("Loading annotations...");
  const { records: rawAnns, skipped } = await readJsonlRaw(ANNOTATIONS);
  // NOTE(review): the records are cast, not validated; a line without `label`
  // would throw further down. Consider a schema, as is done for paragraphs.
  const anns = rawAnns as Ann[];
  console.log(` ${anns.length.toLocaleString()} annotations (${skipped} skipped)`);

  // Group annotations by paragraph
  const byParagraph = new Map<string, Ann[]>();
  for (const a of anns) {
    let arr = byParagraph.get(a.paragraphId);
    if (!arr) { arr = []; byParagraph.set(a.paragraphId, arr); }
    arr.push(a);
  }

  // Count paragraphs per company (for company-size bucketing)
  const companyParaCount = new Map<string, number>();
  for (const p of paragraphs) {
    const name = p.filing.companyName;
    companyParaCount.set(name, (companyParaCount.get(name) ?? 0) + 1);
  }

  // ── Analyze each paragraph ──────────────────────────────────────────
  const analyzed: ParagraphAnalysis[] = [];

  for (const [pid, pannAnns] of byParagraph) {
    if (pannAnns.length < 3) continue;
    const para = paraById.get(pid);
    if (!para) continue;

    const cats = pannAnns.map(a => a.label.content_category);
    const specs = pannAnns.map(a => a.label.specificity_level);
    const catConfidences = pannAnns.map(a => a.label.category_confidence);
    const specConfidences = pannAnns.map(a => a.label.specificity_confidence);

    const catMaj = majority(cats);
    const specMaj = majority(specs);

    const catSeg: ParagraphAnalysis["catSegment"] = cats.every(c => c === cats[0])
      ? "cat-unanimous"
      : catMaj ? "cat-majority" : "cat-split";
    const specSeg: ParagraphAnalysis["specSegment"] = specs.every(s => s === specs[0])
      ? "spec-unanimous"
      : specMaj ? "spec-majority" : "spec-split";

    let segment: Segment;
    if (catSeg === "cat-unanimous" && specSeg === "spec-unanimous") {
      segment = "unanimous";
    } else if (catMaj && specMaj) {
      segment = "majority";
    } else {
      segment = "unresolved";
    }

    analyzed.push({
      id: pid,
      segment,
      catSegment: catSeg,
      specSegment: specSeg,
      // With no majority, fall back to the first annotation so the paragraph
      // still lands in some bucket.
      majorityCat: catMaj?.value ?? cats[0],
      majoritySpec: specMaj?.value ?? specs[0],
      cats,
      specs,
      catConfidences,
      specConfidences,
      companyName: para.filing.companyName,
      ticker: para.filing.ticker,
      filingType: para.filing.filingType,
      filingDate: para.filing.filingDate,
      fiscalYear: para.filing.fiscalYear,
      secItem: para.filing.secItem,
      wordCount: para.wordCount,
    });
  }

  console.log(`\n${analyzed.length.toLocaleString()} paragraphs analyzed\n`);

  // With nothing to analyze, the quintile boundaries and averages below would
  // be undefined/NaN, so stop here.
  if (analyzed.length === 0) {
    console.log("No paragraphs with at least 3 annotations; nothing to report.");
    return;
  }

  // ── Overview ─────────────────────────────────────────────────────────
  const segCounts: Record<Segment, number> = { unanimous: 0, majority: 0, unresolved: 0 };
  for (const a of analyzed) segCounts[a.segment]++;
  console.log("SEGMENT OVERVIEW:");
  console.log(` Unanimous: ${segCounts.unanimous.toLocaleString()} (${pct(segCounts.unanimous, analyzed.length)})`);
  console.log(` Majority: ${segCounts.majority.toLocaleString()} (${pct(segCounts.majority, analyzed.length)})`);
  console.log(` Unresolved: ${segCounts.unresolved.toLocaleString()} (${pct(segCounts.unresolved, analyzed.length)})`);

  // Cat vs spec disagreement breakdown
  const catSpecBreakdown = { catOnly: 0, specOnly: 0, both: 0 };
  for (const a of analyzed) {
    if (a.segment === "unanimous") continue;
    const catDis = a.catSegment !== "cat-unanimous";
    const specDis = a.specSegment !== "spec-unanimous";
    if (catDis && specDis) catSpecBreakdown.both++;
    else if (catDis) catSpecBreakdown.catOnly++;
    else catSpecBreakdown.specOnly++;
  }
  const disputed = segCounts.majority + segCounts.unresolved;
  console.log(`\n Disagreement breakdown (of ${disputed.toLocaleString()} non-unanimous):`);
  console.log(` Category only: ${catSpecBreakdown.catOnly.toLocaleString()} (${pct(catSpecBreakdown.catOnly, disputed)})`);
  console.log(` Specificity only: ${catSpecBreakdown.specOnly.toLocaleString()} (${pct(catSpecBreakdown.specOnly, disputed)})`);
  console.log(` Both: ${catSpecBreakdown.both.toLocaleString()} (${pct(catSpecBreakdown.both, disputed)})`);

  // ── Distribution functions ──────────────────────────────────────────
  /** Tallies `analyzed` by `keyFn`, counting the total and each segment per key. */
  function buildDist(keyFn: (a: ParagraphAnalysis) => string) {
    const dist = new Map<string, { total: number; unanimous: number; majority: number; unresolved: number }>();
    for (const a of analyzed) {
      const key = keyFn(a);
      let entry = dist.get(key);
      if (!entry) { entry = { total: 0, unanimous: 0, majority: 0, unresolved: 0 }; dist.set(key, entry); }
      entry.total++;
      entry[a.segment]++;
    }
    return dist;
  }

  /** Counts `analyzed` rows into a row-key × column-key table, optionally filtered. */
  function buildCrossTab(
    rowKey: (a: ParagraphAnalysis) => string,
    colKey: (a: ParagraphAnalysis) => string,
    include: (a: ParagraphAnalysis) => boolean = () => true,
  ): Map<string, Map<string, number>> {
    const table = new Map<string, Map<string, number>>();
    for (const a of analyzed) {
      if (!include(a)) continue;
      const key = rowKey(a);
      let row = table.get(key);
      if (!row) { row = new Map(); table.set(key, row); }
      const col = colKey(a);
      row.set(col, (row.get(col) ?? 0) + 1);
    }
    return table;
  }

  const segmentCols = ["unanimous", "majority", "unresolved"];

  // ── 1. By fiscal year ───────────────────────────────────────────────
  printDistribution("BY FISCAL YEAR", buildDist(a => String(a.fiscalYear)));

  // ── 2. By filing type ───────────────────────────────────────────────
  printDistribution("BY FILING TYPE", buildDist(a => a.filingType));

  // ── 3. By SEC item ──────────────────────────────────────────────────
  printDistribution("BY SEC ITEM", buildDist(a => a.secItem));

  // ── 4. By majority category ─────────────────────────────────────────
  printDistribution("BY MAJORITY CATEGORY", buildDist(a => a.majorityCat));

  // ── 5. By majority specificity ──────────────────────────────────────
  const specLabels: Record<number, string> = {
    1: "1-Generic", 2: "2-Sector", 3: "3-Firm", 4: "4-Quantified"
  };
  const specLabelOf = (a: ParagraphAnalysis): string => specLabels[a.majoritySpec] ?? String(a.majoritySpec);
  printDistribution("BY MAJORITY SPECIFICITY", buildDist(specLabelOf));

  // ── 6. By confidence pattern ────────────────────────────────────────
  // Sort a copy: Array.prototype.sort works in place and would reorder the
  // confidences stored on each analyzed paragraph.
  printDistribution("BY CATEGORY CONFIDENCE PATTERN",
    buildDist(a => [...a.catConfidences].sort().join("/")));
  printDistribution("BY SPECIFICITY CONFIDENCE PATTERN",
    buildDist(a => [...a.specConfidences].sort().join("/")));

  // ── 7. By word count quintile ───────────────────────────────────────
  const wordCounts = analyzed.map(a => a.wordCount).sort((a, b) => a - b);
  const q20 = wordCounts[Math.floor(wordCounts.length * 0.2)];
  const q40 = wordCounts[Math.floor(wordCounts.length * 0.4)];
  const q60 = wordCounts[Math.floor(wordCounts.length * 0.6)];
  const q80 = wordCounts[Math.floor(wordCounts.length * 0.8)];
  console.log(`\n Word count quintile boundaries: ${q20}, ${q40}, ${q60}, ${q80}`);
  printDistribution("BY WORD COUNT QUINTILE", buildDist(a => {
    if (a.wordCount <= q20) return `Q1 (≤${q20})`;
    if (a.wordCount <= q40) return `Q2 (${q20+1}-${q40})`;
    if (a.wordCount <= q60) return `Q3 (${q40+1}-${q60})`;
    if (a.wordCount <= q80) return `Q4 (${q60+1}-${q80})`;
    return `Q5 (>${q80})`;
  }));

  // ── 8. By company size bucket ───────────────────────────────────────
  printDistribution("BY COMPANY SIZE (paragraph count)", buildDist(a => {
    const n = companyParaCount.get(a.companyName) ?? 0;
    if (n <= 3) return "1-3 paras";
    if (n <= 6) return "4-6 paras";
    if (n <= 10) return "7-10 paras";
    if (n <= 20) return "11-20 paras";
    return "21+ paras";
  }));

  // ── 9. Cross-tab: category × segment ────────────────────────────────
  printCrossTab("CATEGORY × SEGMENT", buildCrossTab(a => a.majorityCat, a => a.segment), segmentCols);

  // ── 10. Cross-tab: specificity × segment ────────────────────────────
  printCrossTab("SPECIFICITY × SEGMENT", buildCrossTab(specLabelOf, a => a.segment), segmentCols);

  // ── 11. Cross-tab: fiscal year × category (for non-unanimous) ──────
  printCrossTab(
    "FISCAL YEAR × CATEGORY (non-unanimous only)",
    buildCrossTab(a => String(a.fiscalYear), a => a.majorityCat, a => a.segment !== "unanimous"),
  );

  // ── 12. Top disagreement companies ──────────────────────────────────
  const companyDisagree = new Map<string, { total: number; disputed: number }>();
  for (const a of analyzed) {
    let entry = companyDisagree.get(a.companyName);
    if (!entry) { entry = { total: 0, disputed: 0 }; companyDisagree.set(a.companyName, entry); }
    entry.total++;
    if (a.segment !== "unanimous") entry.disputed++;
  }

  // Companies with too few analyzed paragraphs would dominate both rankings
  // with 0% or 100% rates, so they are dropped.
  const companyRates = [...companyDisagree.entries()]
    .filter(([, v]) => v.total >= 5)
    .map(([name, v]) => ({ name, ...v, rate: v.disputed / v.total }));

  const printCompanies = (rows: typeof companyRates): void => {
    for (const c of rows) {
      console.log(` ${c.name.slice(0, 45).padEnd(45)} ${c.disputed}/${c.total} disputed (${pct(c.disputed, c.total)})`);
    }
  };

  banner(" TOP 30 COMPANIES BY DISAGREEMENT RATE (min 5 paragraphs)");
  printCompanies([...companyRates].sort((a, b) => b.rate - a.rate).slice(0, 30));

  // ── 13. Bottom 30 companies (lowest disagreement) ──────────────────
  banner(" TOP 30 COMPANIES BY AGREEMENT RATE (min 5 paragraphs)");
  printCompanies([...companyRates].sort((a, b) => a.rate - b.rate).slice(0, 30));

  // ── 14. Specificity spread analysis ─────────────────────────────────
  banner(" SPECIFICITY SPREAD (max - min) FOR NON-UNANIMOUS");
  const specSpread = new Map<number, number>();
  for (const a of analyzed) {
    if (a.specSegment === "spec-unanimous") continue;
    const spread = Math.max(...a.specs) - Math.min(...a.specs);
    specSpread.set(spread, (specSpread.get(spread) ?? 0) + 1);
  }
  // Keys are numeric so rows are ordered by spread size rather than by the
  // string form of each map entry.
  for (const [spread, count] of [...specSpread.entries()].sort(([a], [b]) => a - b)) {
    console.log(` spread-${spread}: ${count.toLocaleString()}`);
  }

  // ── 15. Most common category dispute pairs ──────────────────────────
  banner(" TOP CATEGORY DISPUTE PAIRS");
  const catPairs = new Map<string, number>();
  for (const a of analyzed) {
    if (a.catSegment === "cat-unanimous") continue;
    // Distinct categories in sorted order, so A/B and B/A count as one pair.
    const key = [...new Set(a.cats)].sort().join(" ↔ ");
    catPairs.set(key, (catPairs.get(key) ?? 0) + 1);
  }
  for (const [pair, count] of [...catPairs.entries()].sort(([, a], [, b]) => b - a).slice(0, 15)) {
    console.log(` ${pair.padEnd(55)} ${count.toLocaleString()}`);
  }

  // ── 16. Spec dispute patterns ───────────────────────────────────────
  banner(" TOP SPECIFICITY DISPUTE PATTERNS");
  const specPatterns = new Map<string, number>();
  for (const a of analyzed) {
    if (a.specSegment === "spec-unanimous") continue;
    const ordered = [...a.specs].sort((x, y) => x - y);
    const key = `[${ordered.join(",")}]`;
    specPatterns.set(key, (specPatterns.get(key) ?? 0) + 1);
  }
  for (const [pattern, count] of [...specPatterns.entries()].sort(([, a], [, b]) => b - a).slice(0, 15)) {
    console.log(` ${pattern.padEnd(20)} ${count.toLocaleString()}`);
  }

  // ── 17. Confidence vs agreement rate ────────────────────────────────
  banner(" AVERAGE CONFIDENCE BY SEGMENT");
  const confScore = (c: string): number => (c === "high" ? 3 : c === "medium" ? 2 : 1);
  // Average over the paragraph's actual annotation count: paragraphs are
  // admitted with three or more annotations, so dividing by 3 would inflate
  // the score of any paragraph that has more.
  const meanScore = (levels: string[]): number =>
    levels.reduce((sum, c) => sum + confScore(c), 0) / levels.length;
  for (const seg of ["unanimous", "majority", "unresolved"] as const) {
    const group = analyzed.filter(a => a.segment === seg);
    if (group.length === 0) {
      // An empty segment has no average; print a placeholder instead of "NaN".
      console.log(` ${seg.padEnd(12)} avg cat conf: n/a avg spec conf: n/a`);
      continue;
    }
    const avgCatConf = group.reduce((s, a) => s + meanScore(a.catConfidences), 0) / group.length;
    const avgSpecConf = group.reduce((s, a) => s + meanScore(a.specConfidences), 0) / group.length;
    console.log(` ${seg.padEnd(12)} avg cat conf: ${avgCatConf.toFixed(2)} avg spec conf: ${avgSpecConf.toFixed(2)}`);
  }

  // ── 18. All-low-confidence counts ───────────────────────────────────
  banner(" ALL-LOW-CONFIDENCE PATTERNS");
  const isAllLow = (levels: string[]): boolean => levels.every(c => c === "low");
  const allLowCat = analyzed.filter(a => isAllLow(a.catConfidences));
  const allLowSpec = analyzed.filter(a => isAllLow(a.specConfidences));
  const allLowBoth = analyzed.filter(a => isAllLow(a.catConfidences) && isAllLow(a.specConfidences));
  console.log(` All-low cat confidence: ${allLowCat.length} (${pct(allLowCat.length, analyzed.length)})`);
  console.log(` All-low spec confidence: ${allLowSpec.length} (${pct(allLowSpec.length, analyzed.length)})`);
  console.log(` All-low both: ${allLowBoth.length} (${pct(allLowBoth.length, analyzed.length)})`);

  // Of those, segment distribution
  for (const [label, group] of [["All-low cat", allLowCat], ["All-low spec", allLowSpec]] as const) {
    const segDist = { unanimous: 0, majority: 0, unresolved: 0 };
    for (const a of group) segDist[a.segment]++;
    console.log(` ${label} → unanimous: ${segDist.unanimous}, majority: ${segDist.majority}, unresolved: ${segDist.unresolved}`);
  }
}

// Script entry point. Log the failure and set a non-zero exit status, so that
// shells and CI do not treat a crashed run as a success.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});