SEC-cyBERT/ts/scripts/dispute-crosstab.ts

/**
 * Detailed cross-tabulations for disputed (non-unanimous) paragraphs.
 *
 * Usage: bun ts/scripts/dispute-crosstab.ts
 */
import { fileURLToPath } from "node:url";

import { readJsonlRaw, readJsonl } from "../src/lib/jsonl.ts";
import { Paragraph } from "@sec-cybert/schemas/paragraph.ts";

// Data files are resolved relative to this script's own location.
// `fileURLToPath` is used instead of `URL.pathname` because `pathname` stays
// percent-encoded (a space in the checkout path becomes "%20") and is not a
// valid filesystem path on Windows ("/C:/...").
const ANN_PATH = fileURLToPath(new URL("../../data/annotations/stage1.jsonl", import.meta.url));
const PARA_PATH = fileURLToPath(new URL("../../data/paragraphs/paragraphs-clean.jsonl", import.meta.url));

/**
 * Shape assumed for one record of the stage-1 annotations JSONL file.
 *
 * Only `paragraphId`, `label.content_category` and `label.specificity_level`
 * are read by this script; the remaining fields are declared but unused here.
 */
interface Ann {
  /** Key used to group annotations that belong to the same paragraph. */
  paragraphId: string;
  label: {
    /** Category label; compared across annotators to detect category disputes. */
    content_category: string;
    /** Specificity label; the report tabulates levels 1 through 4. */
    specificity_level: number;
    // The three fields below are not read by this script.
    category_confidence: string;
    specificity_confidence: string;
    reasoning: string;
  };
  // Not read by this script. Presumably per-request metadata about the model
  // call that produced the label — verify against the annotation writer.
  provenance: {
    modelId: string;
    costUsd: number;
    inputTokens: number;
    outputTokens: number;
    reasoningTokens: number;
    latencyMs: number;
    requestedAt: string;
  };
}

// ── Helpers ────────────────────────────────────────────────────────────
/**
 * Format `n / total` as a percentage string with one decimal place.
 *
 * A `total` of zero yields "0.0%" instead of dividing by zero.
 */
function pct(n: number, total: number): string {
  const ratio = total === 0 ? 0 : n / total;
  return (ratio * 100).toFixed(1) + "%";
}

/**
 * Median of a list of numbers; the input array is not modified.
 *
 * Returns 0 for an empty list. For an even number of values the result is
 * the mean of the two middle values.
 */
function median(arr: number[]): number {
  const n = arr.length;
  if (n === 0) return 0;

  const ordered = arr.slice().sort((x, y) => x - y);
  const upper = Math.floor(n / 2);
  if (n % 2 !== 0) {
    return ordered[upper];
  }
  return (ordered[upper - 1] + ordered[upper]) / 2;
}

/**
 * Percentile of a list of numbers using linear interpolation between the two
 * closest ranks; the input array is not modified.
 *
 * @param arr Values to summarise; an empty list yields 0.
 * @param p   Percentile to compute, from 0 (minimum) to 100 (maximum).
 * @returns   The interpolated value at percentile `p`.
 * @throws RangeError when `p` is NaN or outside [0, 100]. Such a value would
 *         otherwise index past the sorted array and silently produce
 *         `undefined`/`NaN`.
 */
function percentile(arr: number[], p: number): number {
  // Written as a negated conjunction so that NaN is rejected too.
  if (!(p >= 0 && p <= 100)) {
    throw new RangeError(`percentile: p must be within [0, 100], got ${p}`);
  }
  if (arr.length === 0) return 0;

  const sorted = [...arr].sort((a, b) => a - b);
  // Fractional rank in the sorted list; lo/hi are the neighbouring indices.
  const idx = (p / 100) * (sorted.length - 1);
  const lo = Math.floor(idx);
  const hi = Math.ceil(idx);
  if (lo === hi) return sorted[lo];
  return sorted[lo] + (sorted[hi] - sorted[lo]) * (idx - lo);
}

/**
 * Return the value held by a strict majority (more than half) of `arr`, or
 * `null` when no value reaches one.
 *
 * The threshold scales with the number of votes. For three votes it is the
 * usual "at least two agree"; for larger lists a fixed threshold of two would
 * return whichever value reached two first, even if another value had more
 * votes.
 *
 * @param arr Votes to tally; values are compared with `Map` key equality.
 * @returns   The majority value, or `null` (also for an empty list).
 */
function majority<T>(arr: T[]): T | null {
  const freq = new Map<T, number>();
  for (const v of arr) freq.set(v, (freq.get(v) ?? 0) + 1);
  for (const [val, count] of freq) {
    // At most one value can exceed half of the votes.
    if (count * 2 > arr.length) return val;
  }
  return null;
}

/**
 * Render numbers in ascending order as a bracketed, comma-separated list,
 * e.g. `[1,2,3]`. The input array is not modified.
 */
function sortedVals(arr: number[]): string {
  const ascending = arr.slice();
  ascending.sort((x, y) => x - y);
  return "[" + ascending.join(",") + "]";
}

/**
 * Distinct strings from `arr`, sorted with the default string ordering.
 * Returns a new array; the input is not modified.
 */
function uniqueSorted(arr: string[]): string[] {
  const distinct = Array.from(new Set(arr));
  distinct.sort();
  return distinct;
}

// ── Main ──────────────────────────────────────────────────────────────
/** Agreement summary for one paragraph, derived from its three annotations. */
interface ParaInfo {
  pid: string;
  /** Category label from each annotation, in file order. */
  cats: string[];
  /** Specificity label from each annotation, in file order. */
  specs: number[];
  catUnanimous: boolean;
  specUnanimous: boolean;
  /** Majority category, or null when the annotators split three ways. */
  majCat: string | null;
  /** Majority specificity, or null when the annotators split three ways. */
  majSpec: number | null;
  catDisputed: boolean;
  specDisputed: boolean;
  disputeType: "none" | "cat-only" | "spec-only" | "both";
  /** Word count from the paragraph file; 0 when the paragraph was not found. */
  wordCount: number;
}

const RULE_WIDE = "═══════════════════════════════════════════════════════════════════";
const RULE = "══════════════════════════════════════════════════════════════";
/** Width of the left-hand "Category" column shared by the tables. */
const CAT_W = 28;
/** Row label for paragraphs whose category vote has no majority. */
const NO_MAJORITY = "[no majority]";
/** Specificity levels tabulated by the report, with their short column labels. */
const SPEC_COLUMNS = [
  { level: 1, label: "GenBoiler" },
  { level: 2, label: "SectorAdpt" },
  { level: 3, label: "FirmSpec" },
  { level: 4, label: "QuantVerif" },
];

/** True when a raw record carries the fields this script reads. */
function hasUsableLabel(rec: unknown): boolean {
  if (typeof rec !== "object" || rec === null) return false;
  const { paragraphId, label } = rec as { paragraphId?: unknown; label?: unknown };
  if (typeof paragraphId !== "string" || typeof label !== "object" || label === null) return false;
  const { content_category, specificity_level } = label as {
    content_category?: unknown;
    specificity_level?: unknown;
  };
  return typeof content_category === "string" && typeof specificity_level === "number";
}

/** Add one to the tally stored under `key`. */
function bump<K>(counts: Map<K, number>, key: K): void {
  counts.set(key, (counts.get(key) ?? 0) + 1);
}

/** Tally entries ordered by descending count (ties keep insertion order). */
function sortedCounts<K>(counts: Map<K, number>): [K, number][] {
  return [...counts.entries()].sort((a, b) => b[1] - a[1]);
}

/** Sum of all tallies in a count map. */
function sumValues(counts: Map<unknown, number>): number {
  let total = 0;
  for (const v of counts.values()) total += v;
  return total;
}

/**
 * Right-align `text` in a table column of `width` characters, always keeping
 * at least one leading space so wide values never fuse with the previous column.
 */
function rcell(text: string, width: number): string {
  return ` ${text.padStart(width - 1)}`;
}

/** Distinct specificity votes in ascending order, e.g. "[1,2]". */
function specPattern(specs: readonly number[]): string {
  return `[${[...new Set(specs)].sort((a, b) => a - b).join(",")}]`;
}

/** Distinct category votes: "A↔B" for a two-way dispute, "A/B/C" otherwise. */
function catPattern(cats: string[]): string {
  const uniq = uniqueSorted(cats);
  return uniq.join(uniq.length === 2 ? "↔" : "/");
}

/** Print the three-line banner that opens a numbered report section. */
function printSectionBanner(title: string): void {
  console.log(`\n\n${RULE}`);
  console.log(`  ${title}`);
  console.log(RULE);
}

/** Print "count  label" lines for the first `limit` entries. */
function printCountLines(entries: readonly [string, number][], limit: number = entries.length): void {
  for (const [label, count] of entries.slice(0, limit)) {
    console.log(`    ${count.toString().padStart(6)}  ${label}`);
  }
}

/** Section 1: majority category x majority specificity matrix for disputed paragraphs. */
function printCategoryBySpecificity(disputed: readonly ParaInfo[]): void {
  printSectionBanner("1. CATEGORY x SPECIFICITY CROSS-TAB (disputed paragraphs)");
  console.log("  Uses majority-vote labels for both axes.\n");

  const catCounts = new Map<string, number>();
  const crossTab = new Map<string, number>();
  let noMajCat = 0;
  let noMajSpec = 0;
  for (const p of disputed) {
    if (p.majCat === null) noMajCat++;
    else bump(catCounts, p.majCat);
    if (p.majSpec === null) noMajSpec++;
    // Only paragraphs with a majority on both axes land in a matrix cell.
    if (p.majCat !== null && p.majSpec !== null) bump(crossTab, `${p.majCat}|${p.majSpec}`);
  }
  const categories = sortedCounts(catCounts).map(([cat]) => cat);

  const colW = 14;
  const header =
    "Category".padEnd(CAT_W) +
    SPEC_COLUMNS.map(({ level, label }) => rcell(`${level}:${label}`, colW)).join("") +
    rcell("Total", colW);
  console.log(`  ${header}`);
  console.log(`  ${"─".repeat(header.length)}`);

  for (const cat of categories) {
    const counts = SPEC_COLUMNS.map(({ level }) => crossTab.get(`${cat}|${level}`) ?? 0);
    const rowTotal = counts.reduce((sum, v) => sum + v, 0);
    let row = cat.padEnd(CAT_W);
    for (const v of counts) {
      const rowPct = rowTotal > 0 ? ((v / rowTotal) * 100).toFixed(0) : "0";
      row += rcell(`${v} (${rowPct}%)`, colW);
    }
    row += rcell(`${rowTotal}`, colW);
    console.log(`  ${row}`);
  }
  console.log(`\n  (${noMajCat} paragraphs had no majority category, ${noMajSpec} had no majority specificity)`);
}

/** Section 2: share of cat-only / spec-only / both disputes per majority category. */
function printDisputeTypeByCategory(disputed: readonly ParaInfo[]): void {
  printSectionBanner("2. DISPUTE TYPE BY MAJORITY CATEGORY");
  console.log("  For each majority category, % of disputes that are cat-only, spec-only, or both.\n");

  const byCat = new Map<string, { catOnly: number; specOnly: number; both: number }>();
  for (const p of disputed) {
    const cat = p.majCat ?? NO_MAJORITY;
    let entry = byCat.get(cat);
    if (!entry) {
      entry = { catOnly: 0, specOnly: 0, both: 0 };
      byCat.set(cat, entry);
    }
    if (p.disputeType === "cat-only") entry.catOnly++;
    else if (p.disputeType === "spec-only") entry.specOnly++;
    else if (p.disputeType === "both") entry.both++;
  }

  const nW = 8;
  const cellW = 16;
  const header =
    "Category".padEnd(CAT_W) +
    rcell("n", nW) +
    rcell("Cat-only", cellW) +
    rcell("Spec-only", cellW) +
    rcell("Both", cellW);
  console.log(`  ${header}`);
  console.log(`  ${"─".repeat(header.length)}`);

  const rows = [...byCat.entries()]
    .map(([cat, d]) => ({ cat, d, total: d.catOnly + d.specOnly + d.both }))
    .sort((a, b) => b.total - a.total);
  for (const { cat, d, total } of rows) {
    const row =
      cat.padEnd(CAT_W) +
      rcell(total.toString(), nW) +
      rcell(`${d.catOnly} (${pct(d.catOnly, total)})`, cellW) +
      rcell(`${d.specOnly} (${pct(d.specOnly, total)})`, cellW) +
      rcell(`${d.both} (${pct(d.both, total)})`, cellW);
    console.log(`  ${row}`);
  }
}

/** Section 3: specificity vote patterns of spec-disputed paragraphs, per majority category. */
function printSpecBoundaryByCategory(allParas: readonly ParaInfo[]): void {
  printSectionBanner("3. SPECIFICITY BOUNDARY DISPUTES BY CATEGORY");
  console.log("  For spec-disputed paragraphs, the spec vote pattern by majority category.\n");

  const patternsByCat = new Map<string, Map<string, number>>();
  const allPatterns = new Set<string>();
  for (const p of allParas) {
    if (!p.specDisputed) continue;
    const cat = p.majCat ?? NO_MAJORITY;
    let patternCounts = patternsByCat.get(cat);
    if (!patternCounts) {
      patternCounts = new Map<string, number>();
      patternsByCat.set(cat, patternCounts);
    }
    const pattern = specPattern(p.specs);
    bump(patternCounts, pattern);
    allPatterns.add(pattern);
  }
  const sortedPatterns = [...allPatterns].sort();

  const rows = [...patternsByCat.entries()]
    .map(([cat, counts]) => ({ cat, counts, total: sumValues(counts) }))
    .sort((a, b) => b.total - a.total);

  const nW = 6;
  const patW = 10;
  const header =
    "Category".padEnd(CAT_W) +
    rcell("n", nW) +
    sortedPatterns.map(pat => rcell(pat, patW)).join("");

  // The same table is printed twice: raw counts, then row percentages.
  const printTable = (formatCell: (count: number, total: number) => string): void => {
    console.log(`  ${header}`);
    console.log(`  ${"─".repeat(header.length)}`);
    for (const { cat, counts, total } of rows) {
      let row = cat.padEnd(CAT_W) + rcell(total.toString(), nW);
      for (const pat of sortedPatterns) {
        const v = counts.get(pat) ?? 0;
        row += rcell(v === 0 ? "-" : formatCell(v, total), patW);
      }
      console.log(`  ${row}`);
    }
  };

  printTable(v => `${v}`);
  console.log("\n  (Row percentages:)");
  printTable((v, total) => `${((v / total) * 100).toFixed(0)}%`);
}

/** Section 4: word-count summary statistics for each dispute-type group. */
function printWordCounts(groups: readonly { label: string; paras: readonly ParaInfo[] }[]): void {
  printSectionBanner("4. WORD COUNT DISTRIBUTION BY DISPUTE TYPE");
  console.log("");

  const labelW = 28;
  const nW = 8;
  const statW = 10;
  const header =
    "Dispute Type".padEnd(labelW) +
    rcell("n", nW) +
    rcell("Median", statW) +
    rcell("P90", statW) +
    rcell("P10", statW) +
    rcell("Mean", statW);
  console.log(`  ${header}`);
  console.log(`  ${"─".repeat(header.length)}`);

  for (const g of groups) {
    // A word count of 0 marks a paragraph missing from the paragraph file.
    const wcs = g.paras.map(p => p.wordCount).filter(w => w > 0);
    if (wcs.length === 0) continue;
    const mean = wcs.reduce((a, b) => a + b, 0) / wcs.length;
    const row =
      g.label.padEnd(labelW) +
      rcell(wcs.length.toString(), nW) +
      rcell(median(wcs).toFixed(0), statW) +
      rcell(percentile(wcs, 90).toFixed(0), statW) +
      rcell(percentile(wcs, 10).toFixed(0), statW) +
      rcell(mean.toFixed(0), statW);
    console.log(`  ${row}`);
  }
}

/** Section 5: paragraphs whose category vote has no majority (three-way splits). */
function printUnresolved(allParas: readonly ParaInfo[]): void {
  printSectionBanner("5. UNRESOLVED PARAGRAPH ANALYSIS (3-way category splits)");
  console.log("");

  const unresolved = allParas.filter(p => p.majCat === null);
  console.log(`  Total unresolved paragraphs: ${unresolved.length.toLocaleString()}`);

  const catFreq = new Map<string, number>();
  const specFreq = new Map<number, number>();
  const threeWay = new Map<string, number>();
  let specUnanimous = 0;
  let specMajority = 0;
  for (const p of unresolved) {
    for (const c of p.cats) bump(catFreq, c);
    for (const s of p.specs) bump(specFreq, s);
    bump(threeWay, [...p.cats].sort().join(" / "));
    if (p.specUnanimous) specUnanimous++;
    if (p.majSpec !== null) specMajority++;
  }

  console.log("\n  Categories appearing in unresolved paragraphs (annotation count):");
  printCountLines(sortedCounts(catFreq));

  console.log("\n  Specificity levels in unresolved paragraphs (annotation count):");
  for (const { level } of SPEC_COLUMNS) {
    const count = specFreq.get(level) ?? 0;
    console.log(`    ${count.toString().padStart(6)}  ${level}`);
  }

  console.log("\n  Most common 3-way category splits:");
  const sortedThreeWay = sortedCounts(threeWay);
  printCountLines(sortedThreeWay, 20);
  if (sortedThreeWay.length > 20) {
    console.log(`    ... and ${sortedThreeWay.length - 20} more patterns`);
  }

  const specThreeWay = unresolved.length - specMajority;
  console.log(`\n  Specificity agreement among unresolved:`);
  console.log(`    Spec unanimous: ${specUnanimous} (${pct(specUnanimous, unresolved.length)})`);
  console.log(`    Spec majority:  ${specMajority} (${pct(specMajority, unresolved.length)})`);
  console.log(`    Spec 3-way:     ${specThreeWay} (${pct(specThreeWay, unresolved.length)})`);
}

/** Section 6: combined category + specificity patterns where both axes are disputed. */
function printBothDisputes(bothDisputed: readonly ParaInfo[]): void {
  printSectionBanner("6. 'BOTH' DISPUTES — COMBINED CATEGORY + SPECIFICITY PATTERNS");
  console.log("");
  console.log(`  Total paragraphs with both cat AND spec disputed: ${bothDisputed.length.toLocaleString()}\n`);

  // One pass fills the combined tally and both single-axis aggregates.
  const combined = new Map<string, number>();
  const catPairs = new Map<string, number>();
  const specPats = new Map<string, number>();
  for (const p of bothDisputed) {
    const catPart = catPattern(p.cats);
    const specPart = specPattern(p.specs);
    bump(combined, `${catPart} + ${specPart}`);
    bump(catPairs, catPart);
    bump(specPats, specPart);
  }

  const sortedCombined = sortedCounts(combined);
  console.log("  Top 30 combined dispute patterns:");
  printCountLines(sortedCombined, 30);
  if (sortedCombined.length > 30) {
    const rest = sortedCombined.slice(30).reduce((sum, entry) => sum + entry[1], 0);
    console.log(`\n    ... and ${sortedCombined.length - 30} more patterns (${rest} paragraphs)`);
  }

  console.log("\n  Category dispute pairs (aggregated across spec patterns):");
  printCountLines(sortedCounts(catPairs), 20);

  console.log("\n  Spec boundary patterns within 'both' disputes:");
  for (const [pat, count] of sortedCounts(specPats)) {
    console.log(`    ${count.toString().padStart(6)}  ${pat}  (${pct(count, bothDisputed.length)})`);
  }
}

/**
 * Load the annotations and paragraphs, classify every paragraph that has
 * exactly three annotations by how its annotators disagree, and print the six
 * cross-tabulation sections to stdout.
 *
 * Annotation records lacking the fields this script reads are dropped and
 * counted rather than crashing the run. Paragraphs without exactly three
 * annotations, or absent from the paragraph file, are reported in the summary.
 */
async function main(): Promise<void> {
  console.log("Loading data...");
  const [{ records: rawAnns, skipped: annSkipped }, { records: paragraphs, skipped: paraSkipped }] =
    await Promise.all([
      readJsonlRaw(ANN_PATH),
      readJsonl(PARA_PATH, Paragraph),
    ]);

  const loadedAnns = rawAnns as Ann[];
  const anns = loadedAnns.filter(a => hasUsableLabel(a));
  const malformed = loadedAnns.length - anns.length;
  console.log(`  ${anns.length.toLocaleString()} annotations (${annSkipped} skipped)`);
  if (malformed > 0) {
    console.log(`  ${malformed.toLocaleString()} annotation records ignored (missing paragraphId or label fields)`);
  }
  console.log(`  ${paragraphs.length.toLocaleString()} paragraphs (${paraSkipped} skipped)\n`);

  // Index paragraphs by id
  const paraById = new Map(paragraphs.map(p => [p.id, p]));

  // Group annotations by paragraph
  const byParagraph = new Map<string, Ann[]>();
  for (const a of anns) {
    let arr = byParagraph.get(a.paragraphId);
    if (!arr) { arr = []; byParagraph.set(a.paragraphId, arr); }
    arr.push(a);
  }

  // Classify each paragraph that has exactly three annotations
  const allParas: ParaInfo[] = [];
  let wrongAnnCount = 0;
  let missingParagraph = 0;
  for (const [pid, panns] of byParagraph) {
    if (panns.length !== 3) {
      wrongAnnCount++;
      continue;
    }
    const cats = panns.map(a => a.label.content_category);
    const specs = panns.map(a => a.label.specificity_level);
    const catDisputed = new Set(cats).size > 1;
    const specDisputed = new Set(specs).size > 1;

    let disputeType: ParaInfo["disputeType"];
    if (catDisputed && specDisputed) disputeType = "both";
    else if (catDisputed) disputeType = "cat-only";
    else if (specDisputed) disputeType = "spec-only";
    else disputeType = "none";

    const para = paraById.get(pid);
    if (para === undefined) missingParagraph++;
    allParas.push({
      pid,
      cats,
      specs,
      catUnanimous: !catDisputed,
      specUnanimous: !specDisputed,
      majCat: majority(cats),
      majSpec: majority(specs),
      catDisputed,
      specDisputed,
      disputeType,
      wordCount: para?.wordCount ?? 0,
    });
  }

  const unanimous = allParas.filter(p => p.disputeType === "none");
  const disputed = allParas.filter(p => p.disputeType !== "none");
  const catOnly = allParas.filter(p => p.disputeType === "cat-only");
  const specOnly = allParas.filter(p => p.disputeType === "spec-only");
  const bothDisputed = allParas.filter(p => p.disputeType === "both");

  console.log(RULE_WIDE);
  console.log("  DISPUTE CROSS-TABULATION ANALYSIS");
  console.log(RULE_WIDE);
  console.log(`  Total paragraphs (3-annotator): ${allParas.length.toLocaleString()}`);
  console.log(`  Disputed (not both-unanimous):  ${disputed.length.toLocaleString()} (${pct(disputed.length, allParas.length)})`);
  console.log(`    Cat-only:  ${catOnly.length.toLocaleString()}`);
  console.log(`    Spec-only: ${specOnly.length.toLocaleString()}`);
  console.log(`    Both:      ${bothDisputed.length.toLocaleString()}`);
  if (wrongAnnCount > 0) {
    console.log(`  Excluded (annotation count != 3): ${wrongAnnCount.toLocaleString()}`);
  }
  if (missingParagraph > 0) {
    console.log(`  Not found in paragraph file (no word count): ${missingParagraph.toLocaleString()}`);
  }

  printCategoryBySpecificity(disputed);
  printDisputeTypeByCategory(disputed);
  printSpecBoundaryByCategory(allParas);
  printWordCounts([
    { label: "Unanimous (no dispute)", paras: unanimous },
    { label: "Cat-only dispute", paras: catOnly },
    { label: "Spec-only dispute", paras: specOnly },
    { label: "Both disputed", paras: bothDisputed },
  ]);
  printUnresolved(allParas);
  printBothDisputes(bothDisputed);

  console.log(`\n${RULE_WIDE}`);
  console.log("  ANALYSIS COMPLETE");
  console.log(RULE_WIDE);
}

// Any failure is logged and turned into a non-zero exit code.
main().catch(err => { console.error(err); process.exit(1); });