SEC-cyBERT/ts/scripts/segment-analysis.ts

/**
 * Cross-tabulate agreement status against all paragraph metadata dimensions.
 *
 * Segments every paragraph into: unanimous | majority | unresolved
 * Then breaks down by: fiscal year, filing type, sec item, category,
 * specificity, confidence, company size (paragraph count proxy),
 * word count quintile, and cross-dimensions.
 *
 * Usage: bun ts/scripts/segment-analysis.ts
 */
import { fileURLToPath } from "node:url";

import { readJsonl, readJsonlRaw } from "../src/lib/jsonl.ts";
import { Paragraph } from "../src/schemas/paragraph.ts";

// Input files, resolved relative to this script rather than the working directory.
// fileURLToPath is used instead of URL#pathname because pathname stays
// percent-encoded (a space in the checkout path would become "%20") and is not
// a valid filesystem path on Windows.
const PARAGRAPHS = fileURLToPath(new URL("../../data/paragraphs/paragraphs-clean.jsonl", import.meta.url));
const ANNOTATIONS = fileURLToPath(new URL("../../data/annotations/stage1.jsonl", import.meta.url));

// ── Types ──────────────────────────────────────────────────────────────
/**
 * One annotation record from the stage-1 annotations file.
 *
 * Loaded records are cast to this shape rather than validated against it, so
 * the field types describe what this script expects, not a runtime guarantee.
 */
interface Ann {
  /** Id of the annotated paragraph; used to group annotations and to look up the paragraph record. */
  paragraphId: string;
  /** The label assigned by this annotation. */
  label: {
    /** Category label; compared across annotations by exact string equality. */
    content_category: string;
    /** Specificity level; this script gives names to levels 1-4 (Generic, Sector, Firm, Quantified). */
    specificity_level: number;
    /** Confidence in the category; scored as "high" = 3, "medium" = 2, any other value = 1. */
    category_confidence: string;
    /** Confidence in the specificity level; scored the same way as category_confidence. */
    specificity_confidence: string;
    /** Not read by this script. */
    reasoning: string;
  };
  /** Metadata about how the annotation was produced; none of these fields are read by this script. */
  provenance: {
    modelId: string;
    costUsd: number;
    inputTokens: number;
    outputTokens: number;
    reasoningTokens: number;
    latencyMs: number;
    requestedAt: string;
  };
}

/**
 * Overall agreement class of a paragraph across its annotations:
 * - "unanimous": every annotation agrees on both category and specificity;
 * - "majority": not unanimous, but each dimension has a value chosen by at least two annotations;
 * - "unresolved": at least one dimension has no value chosen by at least two annotations.
 */
type Segment = "unanimous" | "majority" | "unresolved";

/**
 * Agreement analysis for one paragraph, joined with the filing metadata of
 * that paragraph. Built only for paragraphs with at least three annotations
 * and a matching paragraph record.
 */
interface ParagraphAnalysis {
  /** Paragraph id. */
  id: string;
  /** Combined agreement class over category and specificity. */
  segment: Segment;
  /** Category agreement: all equal, some value chosen at least twice, or neither. */
  catSegment: "cat-unanimous" | "cat-majority" | "cat-split";
  /** Specificity agreement: all equal, some value chosen at least twice, or neither. */
  specSegment: "spec-unanimous" | "spec-majority" | "spec-split";
  /** Most frequent category; falls back to the first annotation's category when no value occurs twice. */
  majorityCat: string;
  /** Most frequent specificity level; falls back to the first annotation's level when no value occurs twice. */
  majoritySpec: number;
  /** Category chosen by each annotation. */
  cats: string[];
  /** Specificity level chosen by each annotation. */
  specs: number[];
  /** Category confidence reported by each annotation. */
  catConfidences: string[];
  /** Specificity confidence reported by each annotation. */
  specConfidences: string[];
  // Filing metadata (copied from the paragraph record's `filing` object)
  companyName: string;
  ticker: string;
  filingType: string;
  filingDate: string;
  fiscalYear: number;
  secItem: string;
  /** Word count taken from the paragraph record. */
  wordCount: number;
}

// ── Helpers ────────────────────────────────────────────────────────────
/**
 * Formats `n / total` as a percentage with one decimal place.
 *
 * @param n - Numerator.
 * @param total - Denominator; a total of zero yields "0.0%" instead of dividing.
 * @returns The percentage string, e.g. "33.3%".
 */
function pct(n: number, total: number): string {
  if (total === 0) {
    return "0.0%";
  }
  const share = (n / total) * 100;
  return share.toFixed(1) + "%";
}

/**
 * Finds the most frequent value in `arr`, provided it occurs at least twice.
 *
 * The threshold of two matches a two-out-of-three vote between three
 * annotators; it is a fixed count, not a fraction of `arr.length`. Ties are
 * broken in favour of the value that appears first in `arr`.
 *
 * @param arr - Values to tally (compared with Map key equality).
 * @returns The winning value and its count, or `null` when no value occurs
 *          at least twice (including for an empty array).
 */
function majority<T>(arr: T[]): { value: T; count: number } | null {
  const counts = new Map<T, number>();
  for (const v of arr) counts.set(v, (counts.get(v) ?? 0) + 1);

  // Track the leader as a whole record so that a legitimate `null` (or
  // `undefined`) element value is never mistaken for "no leader yet".
  let best: { value: T; count: number } | null = null;
  for (const [value, count] of counts) {
    // Strict `>` keeps the first-seen value on ties (Map iterates in insertion order).
    if (best === null || count > best.count) best = { value, count };
  }
  return best !== null && best.count >= 2 ? best : null;
}

/**
 * Prints a table of agreement counts per key to stdout.
 *
 * Rows are ordered by descending total. Each row shows the total, the
 * unanimous / majority / unresolved counts, and the unanimous and unresolved
 * shares of the row total.
 *
 * @param label - Table title printed between two rule lines.
 * @param counts - Per-key tallies to print.
 */
function printDistribution(label: string, counts: Map<string, { total: number; unanimous: number; majority: number; unresolved: number }>) {
  const rule = "═".repeat(70);
  // Every numeric/percentage column is right-aligned in a 7-character cell.
  const cell = (text: string | number) => String(text).padStart(7);

  console.log(`\n${rule}`);
  console.log(`  ${label}`);
  console.log(rule);

  const rows = Array.from(counts).sort((left, right) => right[1].total - left[1].total);
  // The key column is at least 20 characters wide.
  const keyWidth = Math.max(20, ...rows.map(([name]) => name.length));

  const headings = ["Total", "Unan", "Maj", "Unres", "Unan%", "Unres%"].map(h => cell(h));
  console.log(`  ${"".padEnd(keyWidth)}  ${headings.join("  ")}`);
  console.log(`  ${"─".repeat(keyWidth + 50)}`);

  for (const [name, tally] of rows) {
    const cells = [
      cell(tally.total),
      cell(tally.unanimous),
      cell(tally.majority),
      cell(tally.unresolved),
      cell(pct(tally.unanimous, tally.total)),
      cell(pct(tally.unresolved, tally.total)),
    ];
    console.log(`  ${name.padEnd(keyWidth)}  ${cells.join("  ")}`);
  }
}

/**
 * Prints a row-normalised cross-tabulation to stdout.
 *
 * Each cell is the column's share of that row's total count, and the row
 * total is appended as `(n=…)`. Rows are ordered by key using localeCompare.
 *
 * @param label - Table title printed between two rule lines.
 * @param rows - Row key → (column key → count).
 * @param colOrder - Columns to print, in order. When omitted, every column
 *                   key found in `rows` is printed in sorted order.
 */
function printCrossTab(label: string, rows: Map<string, Map<string, number>>, colOrder?: string[]) {
  const COL_WIDTH = 8;
  const banner = "═".repeat(70);

  console.log(`\n${banner}`);
  console.log(`  ${label}`);
  console.log(banner);

  let columns: string[];
  if (colOrder != null) {
    columns = colOrder;
  } else {
    // Collect the distinct column keys across all rows.
    const seen = new Set<string>();
    for (const counts of rows.values()) {
      for (const name of counts.keys()) seen.add(name);
    }
    columns = Array.from(seen).sort();
  }

  // The row-key column is at least 15 characters wide.
  const keyWidth = Math.max(15, ...Array.from(rows.keys(), name => name.length));

  // Column headings are truncated to the cell width before right-aligning.
  const headings = columns.map(name => name.slice(0, COL_WIDTH).padStart(COL_WIDTH));
  console.log(`  ${"".padEnd(keyWidth)}  ${headings.join("  ")}`);
  console.log(`  ${"─".repeat(keyWidth + (COL_WIDTH + 2) * columns.length)}`);

  const ordered = Array.from(rows).sort((x, y) => x[0].localeCompare(y[0]));
  for (const [rowKey, counts] of ordered) {
    // The denominator is the whole row, including columns not listed in `columns`.
    let rowTotal = 0;
    for (const n of counts.values()) rowTotal += n;

    const rendered = columns.map(name => pct(counts.get(name) ?? 0, rowTotal).padStart(COL_WIDTH));
    console.log(`  ${rowKey.padEnd(keyWidth)}  ${rendered.join("  ")}  (n=${rowTotal})`);
  }
}

// ── Main ───────────────────────────────────────────────────────────────
/**
 * Loads the paragraph and stage-1 annotation files, classifies every paragraph
 * that has at least three annotations by annotator agreement, and prints the
 * agreement breakdowns to stdout: overview, per-dimension distributions,
 * cross-tabs, per-company rates, dispute patterns and confidence summaries.
 *
 * Annotation groups whose paragraph id is not present in the paragraph file,
 * or that contain fewer than three annotations, are skipped.
 */
async function main(): Promise<void> {
  console.log("Loading paragraphs...");
  const { records: paragraphs } = await readJsonl(PARAGRAPHS, Paragraph);
  console.log(`  ${paragraphs.length.toLocaleString()} paragraphs`);

  const paraById = new Map(paragraphs.map(p => [p.id, p]));

  console.log("Loading annotations...");
  const { records: rawAnns, skipped } = await readJsonlRaw(ANNOTATIONS);
  // NOTE(review): raw records are cast, not validated, so a malformed record
  // surfaces later as a runtime error or an odd table key.
  const anns = rawAnns as Ann[];
  console.log(`  ${anns.length.toLocaleString()} annotations (${skipped} skipped)`);

  // Group annotations by paragraph
  const byParagraph = new Map<string, Ann[]>();
  for (const a of anns) {
    let arr = byParagraph.get(a.paragraphId);
    if (!arr) { arr = []; byParagraph.set(a.paragraphId, arr); }
    arr.push(a);
  }

  // Count paragraphs per company (for company-size bucketing).
  // This counts every loaded paragraph, not only the analyzed ones.
  const companyParaCount = new Map<string, number>();
  for (const p of paragraphs) {
    const name = p.filing.companyName;
    companyParaCount.set(name, (companyParaCount.get(name) ?? 0) + 1);
  }

  // ── Analyze each paragraph ──────────────────────────────────────────
  const analyzed: ParagraphAnalysis[] = [];

  for (const [pid, pannAnns] of byParagraph) {
    if (pannAnns.length < 3) continue;
    const para = paraById.get(pid);
    if (!para) continue;

    const cats = pannAnns.map(a => a.label.content_category);
    const specs = pannAnns.map(a => a.label.specificity_level);
    const catConfidences = pannAnns.map(a => a.label.category_confidence);
    const specConfidences = pannAnns.map(a => a.label.specificity_confidence);

    const catMaj = majority(cats);
    const specMaj = majority(specs);

    const catSeg = cats.every(c => c === cats[0]) ? "cat-unanimous"
      : catMaj ? "cat-majority" : "cat-split";
    const specSeg = specs.every(s => s === specs[0]) ? "spec-unanimous"
      : specMaj ? "spec-majority" : "spec-split";

    let segment: Segment;
    if (catSeg === "cat-unanimous" && specSeg === "spec-unanimous") {
      segment = "unanimous";
    } else if (catMaj && specMaj) {
      segment = "majority";
    } else {
      segment = "unresolved";
    }

    analyzed.push({
      id: pid,
      segment,
      catSegment: catSeg,
      specSegment: specSeg,
      // With no majority, fall back to the first annotation's value so the
      // paragraph still lands in some bucket of the per-label tables.
      majorityCat: catMaj?.value ?? cats[0],
      majoritySpec: specMaj?.value ?? specs[0],
      cats,
      specs,
      catConfidences,
      specConfidences,
      companyName: para.filing.companyName,
      ticker: para.filing.ticker,
      filingType: para.filing.filingType,
      filingDate: para.filing.filingDate,
      fiscalYear: para.filing.fiscalYear,
      secItem: para.filing.secItem,
      wordCount: para.wordCount,
    });
  }

  console.log(`\n${analyzed.length.toLocaleString()} paragraphs analyzed\n`);

  // Without any analyzed paragraph the tables below would only show
  // `undefined` quintile boundaries and NaN averages.
  if (analyzed.length === 0) {
    console.log("Nothing to analyze: no paragraph has three or more annotations and a matching paragraph record.");
    return;
  }

  // ── Overview ─────────────────────────────────────────────────────────
  const segCounts = { unanimous: 0, majority: 0, unresolved: 0 };
  for (const a of analyzed) segCounts[a.segment]++;
  console.log("SEGMENT OVERVIEW:");
  console.log(`  Unanimous:  ${segCounts.unanimous.toLocaleString()} (${pct(segCounts.unanimous, analyzed.length)})`);
  console.log(`  Majority:   ${segCounts.majority.toLocaleString()} (${pct(segCounts.majority, analyzed.length)})`);
  console.log(`  Unresolved: ${segCounts.unresolved.toLocaleString()} (${pct(segCounts.unresolved, analyzed.length)})`);

  // Cat vs spec disagreement breakdown
  const catSpecBreakdown = { catOnly: 0, specOnly: 0, both: 0 };
  for (const a of analyzed) {
    if (a.segment === "unanimous") continue;
    const catDis = a.catSegment !== "cat-unanimous";
    const specDis = a.specSegment !== "spec-unanimous";
    if (catDis && specDis) catSpecBreakdown.both++;
    else if (catDis) catSpecBreakdown.catOnly++;
    // A non-unanimous paragraph disagrees on at least one dimension, so the
    // remaining case is specificity-only.
    else catSpecBreakdown.specOnly++;
  }
  const disputed = segCounts.majority + segCounts.unresolved;
  console.log(`\n  Disagreement breakdown (of ${disputed.toLocaleString()} non-unanimous):`);
  console.log(`    Category only:    ${catSpecBreakdown.catOnly.toLocaleString()} (${pct(catSpecBreakdown.catOnly, disputed)})`);
  console.log(`    Specificity only: ${catSpecBreakdown.specOnly.toLocaleString()} (${pct(catSpecBreakdown.specOnly, disputed)})`);
  console.log(`    Both:             ${catSpecBreakdown.both.toLocaleString()} (${pct(catSpecBreakdown.both, disputed)})`);

  // ── Distribution functions ──────────────────────────────────────────
  // Tallies analyzed paragraphs per key, split by agreement segment.
  function buildDist(keyFn: (a: ParagraphAnalysis) => string) {
    const dist = new Map<string, { total: number; unanimous: number; majority: number; unresolved: number }>();
    for (const a of analyzed) {
      const key = keyFn(a);
      let entry = dist.get(key);
      if (!entry) { entry = { total: 0, unanimous: 0, majority: 0, unresolved: 0 }; dist.set(key, entry); }
      entry.total++;
      entry[a.segment]++;
    }
    return dist;
  }

  // ── 1. By fiscal year ───────────────────────────────────────────────
  printDistribution("BY FISCAL YEAR", buildDist(a => String(a.fiscalYear)));

  // ── 2. By filing type ───────────────────────────────────────────────
  printDistribution("BY FILING TYPE", buildDist(a => a.filingType));

  // ── 3. By SEC item ──────────────────────────────────────────────────
  printDistribution("BY SEC ITEM", buildDist(a => a.secItem));

  // ── 4. By majority category ─────────────────────────────────────────
  printDistribution("BY MAJORITY CATEGORY", buildDist(a => a.majorityCat));

  // ── 5. By majority specificity ──────────────────────────────────────
  const specLabels: Record<number, string> = {
    1: "1-Generic", 2: "2-Sector", 3: "3-Firm", 4: "4-Quantified"
  };
  printDistribution("BY MAJORITY SPECIFICITY", buildDist(a => specLabels[a.majoritySpec] ?? String(a.majoritySpec)));

  // ── 6. By confidence pattern ────────────────────────────────────────
  // Sort a copy: Array#sort works in place and would otherwise reorder the
  // confidences stored on each analyzed paragraph.
  printDistribution("BY CATEGORY CONFIDENCE PATTERN",
    buildDist(a => [...a.catConfidences].sort().join("/")));
  printDistribution("BY SPECIFICITY CONFIDENCE PATTERN",
    buildDist(a => [...a.specConfidences].sort().join("/")));

  // ── 7. By word count quintile ───────────────────────────────────────
  const wordCounts = analyzed.map(a => a.wordCount).sort((a, b) => a - b);
  const q20 = wordCounts[Math.floor(wordCounts.length * 0.2)];
  const q40 = wordCounts[Math.floor(wordCounts.length * 0.4)];
  const q60 = wordCounts[Math.floor(wordCounts.length * 0.6)];
  const q80 = wordCounts[Math.floor(wordCounts.length * 0.8)];
  console.log(`\n  Word count quintile boundaries: ${q20}, ${q40}, ${q60}, ${q80}`);
  printDistribution("BY WORD COUNT QUINTILE", buildDist(a => {
    if (a.wordCount <= q20) return `Q1 (≤${q20})`;
    if (a.wordCount <= q40) return `Q2 (${q20+1}-${q40})`;
    if (a.wordCount <= q60) return `Q3 (${q40+1}-${q60})`;
    if (a.wordCount <= q80) return `Q4 (${q60+1}-${q80})`;
    return `Q5 (>${q80})`;
  }));

  // ── 8. By company size bucket ───────────────────────────────────────
  printDistribution("BY COMPANY SIZE (paragraph count)", buildDist(a => {
    const n = companyParaCount.get(a.companyName) ?? 0;
    if (n <= 3) return "1-3 paras";
    if (n <= 6) return "4-6 paras";
    if (n <= 10) return "7-10 paras";
    if (n <= 20) return "11-20 paras";
    return "21+ paras";
  }));

  // ── 9. Cross-tab: category × segment ────────────────────────────────
  const catBySegment = new Map<string, Map<string, number>>();
  for (const a of analyzed) {
    const key = a.majorityCat;
    let row = catBySegment.get(key);
    if (!row) { row = new Map(); catBySegment.set(key, row); }
    row.set(a.segment, (row.get(a.segment) ?? 0) + 1);
  }
  printCrossTab("CATEGORY × SEGMENT", catBySegment, ["unanimous", "majority", "unresolved"]);

  // ── 10. Cross-tab: specificity × segment ────────────────────────────
  const specBySegment = new Map<string, Map<string, number>>();
  for (const a of analyzed) {
    const key = specLabels[a.majoritySpec] ?? String(a.majoritySpec);
    let row = specBySegment.get(key);
    if (!row) { row = new Map(); specBySegment.set(key, row); }
    row.set(a.segment, (row.get(a.segment) ?? 0) + 1);
  }
  printCrossTab("SPECIFICITY × SEGMENT", specBySegment, ["unanimous", "majority", "unresolved"]);

  // ── 11. Cross-tab: fiscal year × category (for non-unanimous) ──────
  const yearByCat = new Map<string, Map<string, number>>();
  for (const a of analyzed) {
    if (a.segment === "unanimous") continue;
    const key = String(a.fiscalYear);
    let row = yearByCat.get(key);
    if (!row) { row = new Map(); yearByCat.set(key, row); }
    row.set(a.majorityCat, (row.get(a.majorityCat) ?? 0) + 1);
  }
  printCrossTab("FISCAL YEAR × CATEGORY (non-unanimous only)", yearByCat);

  // ── 12. Top disagreement companies ──────────────────────────────────
  const companyDisagree = new Map<string, { total: number; disputed: number }>();
  for (const a of analyzed) {
    let entry = companyDisagree.get(a.companyName);
    if (!entry) { entry = { total: 0, disputed: 0 }; companyDisagree.set(a.companyName, entry); }
    entry.total++;
    if (a.segment !== "unanimous") entry.disputed++;
  }

  // Companies with enough analyzed paragraphs for a rate to be meaningful;
  // computed once and shared by both rankings below.
  const companyRates = [...companyDisagree.entries()]
    .filter(([, v]) => v.total >= 5)
    .map(([name, v]) => ({ name, ...v, rate: v.disputed / v.total }));

  console.log(`\n${"═".repeat(70)}`);
  console.log("  TOP 30 COMPANIES BY DISAGREEMENT RATE (min 5 paragraphs)");
  console.log(`${"═".repeat(70)}`);
  const companyRanked = [...companyRates]
    .sort((a, b) => b.rate - a.rate)
    .slice(0, 30);

  for (const c of companyRanked) {
    console.log(`  ${c.name.slice(0, 45).padEnd(45)}  ${c.disputed}/${c.total} disputed (${pct(c.disputed, c.total)})`);
  }

  // ── 13. Bottom 30 companies (lowest disagreement) ──────────────────
  console.log(`\n${"═".repeat(70)}`);
  console.log("  TOP 30 COMPANIES BY AGREEMENT RATE (min 5 paragraphs)");
  console.log(`${"═".repeat(70)}`);
  const companyAgreed = [...companyRates]
    .sort((a, b) => a.rate - b.rate)
    .slice(0, 30);

  for (const c of companyAgreed) {
    console.log(`  ${c.name.slice(0, 45).padEnd(45)}  ${c.disputed}/${c.total} disputed (${pct(c.disputed, c.total)})`);
  }

  // ── 14. Specificity spread analysis ─────────────────────────────────
  console.log(`\n${"═".repeat(70)}`);
  console.log("  SPECIFICITY SPREAD (max - min) FOR NON-UNANIMOUS");
  console.log(`${"═".repeat(70)}`);
  const specSpread = new Map<string, number>();
  for (const a of analyzed) {
    if (a.specSegment === "spec-unanimous") continue;
    const spread = Math.max(...a.specs) - Math.min(...a.specs);
    const key = `spread-${spread}`;
    specSpread.set(key, (specSpread.get(key) ?? 0) + 1);
  }
  // Order by key explicitly instead of relying on how [key, count] tuples stringify.
  for (const [key, count] of [...specSpread.entries()].sort(([a], [b]) => a.localeCompare(b))) {
    console.log(`  ${key}: ${count.toLocaleString()}`);
  }

  // ── 15. Most common category dispute pairs ──────────────────────────
  console.log(`\n${"═".repeat(70)}`);
  console.log("  TOP CATEGORY DISPUTE PAIRS");
  console.log(`${"═".repeat(70)}`);
  const catPairs = new Map<string, number>();
  for (const a of analyzed) {
    if (a.catSegment === "cat-unanimous") continue;
    // Distinct categories in sorted order, so the same set always yields the same key.
    const distinctCats = [...new Set(a.cats)].sort();
    const key = distinctCats.join(" ↔ ");
    catPairs.set(key, (catPairs.get(key) ?? 0) + 1);
  }
  for (const [pair, count] of [...catPairs.entries()].sort(([, a], [, b]) => b - a).slice(0, 15)) {
    console.log(`  ${pair.padEnd(55)} ${count.toLocaleString()}`);
  }

  // ── 16. Spec dispute patterns ───────────────────────────────────────
  console.log(`\n${"═".repeat(70)}`);
  console.log("  TOP SPECIFICITY DISPUTE PATTERNS");
  console.log(`${"═".repeat(70)}`);
  const specPatterns = new Map<string, number>();
  for (const a of analyzed) {
    if (a.specSegment === "spec-unanimous") continue;
    const sortedSpecs = [...a.specs].sort((x, y) => x - y);
    const key = `[${sortedSpecs.join(",")}]`;
    specPatterns.set(key, (specPatterns.get(key) ?? 0) + 1);
  }
  for (const [pattern, count] of [...specPatterns.entries()].sort(([, a], [, b]) => b - a).slice(0, 15)) {
    console.log(`  ${pattern.padEnd(20)} ${count.toLocaleString()}`);
  }

  // ── 17. Confidence vs agreement rate ────────────────────────────────
  console.log(`\n${"═".repeat(70)}`);
  console.log("  AVERAGE CONFIDENCE BY SEGMENT");
  console.log(`${"═".repeat(70)}`);
  const confScore = (c: string) => c === "high" ? 3 : c === "medium" ? 2 : 1;
  // Mean score over however many annotations the paragraph has; analyzed
  // paragraphs have three or more, so a fixed divisor of 3 would overstate
  // the mean for any paragraph with extra annotations.
  const meanConf = (levels: string[]) =>
    levels.reduce((sum, c) => sum + confScore(c), 0) / levels.length;
  for (const seg of ["unanimous", "majority", "unresolved"] as const) {
    const group = analyzed.filter(a => a.segment === seg);
    if (group.length === 0) {
      // Avoid printing NaN for a segment that has no paragraphs.
      console.log(`  ${seg.padEnd(12)}  (no paragraphs)`);
      continue;
    }
    const avgCatConf = group.reduce((s, a) => s + meanConf(a.catConfidences), 0) / group.length;
    const avgSpecConf = group.reduce((s, a) => s + meanConf(a.specConfidences), 0) / group.length;
    console.log(`  ${seg.padEnd(12)}  avg cat conf: ${avgCatConf.toFixed(2)}  avg spec conf: ${avgSpecConf.toFixed(2)}`);
  }

  // ── 18. All-low-confidence counts ───────────────────────────────────
  console.log(`\n${"═".repeat(70)}`);
  console.log("  ALL-LOW-CONFIDENCE PATTERNS");
  console.log(`${"═".repeat(70)}`);
  const allLowCat = analyzed.filter(a => a.catConfidences.every(c => c === "low"));
  const allLowSpec = analyzed.filter(a => a.specConfidences.every(c => c === "low"));
  const allLowBoth = analyzed.filter(a => a.catConfidences.every(c => c === "low") && a.specConfidences.every(c => c === "low"));
  console.log(`  All-low cat confidence:  ${allLowCat.length} (${pct(allLowCat.length, analyzed.length)})`);
  console.log(`  All-low spec confidence: ${allLowSpec.length} (${pct(allLowSpec.length, analyzed.length)})`);
  console.log(`  All-low both:            ${allLowBoth.length} (${pct(allLowBoth.length, analyzed.length)})`);

  // Of those, segment distribution
  for (const [label, group] of [["All-low cat", allLowCat], ["All-low spec", allLowSpec]] as const) {
    const segDist = { unanimous: 0, majority: 0, unresolved: 0 };
    for (const a of group) segDist[a.segment]++;
    console.log(`  ${label} → unanimous: ${segDist.unanimous}, majority: ${segDist.majority}, unresolved: ${segDist.unresolved}`);
  }
}

// Run the script. A failure is logged and also sets a non-zero exit code, so
// shells and CI pipelines see it instead of a successful exit.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});