SEC-cyBERT/ts/scripts/sample-disputes.ts

/**
 * Sample and print full paragraph text for the hardest dispute types.
 *
 * Prints up to 5 paragraphs from each of 4 dispute categories (at most 20 total),
 * with all 3 model annotations and company metadata.
 *
 * Usage: bun ts/scripts/sample-disputes.ts
 */
import { fileURLToPath } from "node:url";

import { Paragraph } from "@sec-cybert/schemas/paragraph.ts";

import { readJsonl, readJsonlRaw } from "../src/lib/jsonl.ts";

// Input files, resolved relative to this script rather than the working directory.
// `fileURLToPath` is used instead of `URL#pathname` because `pathname` stays
// percent-encoded (a checkout under "my repo/" would yield "my%20repo/") and is
// not a usable filesystem path on Windows.
const PARAGRAPHS = fileURLToPath(new URL("../../data/paragraphs/paragraphs-clean.jsonl", import.meta.url));
const ANNOTATIONS = fileURLToPath(new URL("../../data/annotations/stage1.jsonl", import.meta.url));

// ── Types ──────────────────────────────────────────────────────────────
/**
 * One model's annotation of a single paragraph, as read from the stage-1
 * annotations file.
 *
 * NOTE: records are cast to this shape without runtime validation, so the
 * fields below describe what this script expects rather than what is enforced.
 */
interface Ann {
  /** ID of the annotated paragraph; used to group annotations and look up the paragraph. */
  paragraphId: string;
  /** The labels the model assigned. */
  label: {
    /** Category name, e.g. "Management Role", "Risk Management Process", "None/Other", "Strategy Integration". */
    content_category: string;
    /** Numeric specificity level; this script singles out levels 3 and 4 (full scale not visible here — TODO confirm). */
    specificity_level: number;
    /** Model-reported confidence in the category; printed verbatim. */
    category_confidence: string;
    /** Model-reported confidence in the specificity level; printed verbatim. */
    specificity_confidence: string;
    /** Free-text rationale from the model; printed verbatim. */
    reasoning: string;
  };
  /** Where the annotation came from. */
  provenance: {
    /** Model identifier, e.g. "google/gemini-3.1-flash-lite-preview". */
    modelId: string;
  };
}

/**
 * Parsed paragraph record: the `_output` type carried by the imported
 * `Paragraph` schema value (presumably a Zod schema — confirm against the
 * schemas package). Shares its name with the value so both can be used side by side.
 */
type Paragraph = (typeof Paragraph)["_output"];

// ── Helpers ────────────────────────────────────────────────────────────
/**
 * Finds the most frequent value in `arr`, provided it occurs at least twice.
 *
 * Values are compared with `Map` key equality (SameValueZero), so objects are
 * matched by reference. When several values share the top count, the one that
 * appears first in `arr` wins.
 *
 * @param arr Values to tally.
 * @returns The winning value and how often it occurred, or `null` when `arr`
 *          is empty or no value occurs at least twice.
 */
function majority<T>(arr: T[]): { value: T; count: number } | null {
  const counts = new Map<T, number>();
  for (const v of arr) counts.set(v, (counts.get(v) ?? 0) + 1);

  // Track the leader as an object rather than a bare `T | null`: `null` (or
  // `undefined`) can be a legitimate element of `arr`, so the value itself
  // must not double as the "nothing found yet" sentinel.
  let best: { value: T; count: number } | null = null;
  for (const [value, count] of counts) {
    // Strict `>` keeps the earliest-seen value on ties (Map iterates in insertion order).
    if (best === null || count > best.count) best = { value, count };
  }

  return best !== null && best.count >= 2 ? best : null;
}

/**
 * Picks `count` candidates in a reproducible way, preferring one per company.
 *
 * Candidates are ordered by ID and visited at a fixed stride. The first pass
 * takes only candidates whose company has not been used yet; if that leaves
 * the sample short, a second pass over the same stride positions tops it up
 * with whatever was skipped.
 *
 * @param candidates Pool to sample from; never mutated.
 * @param count Number of items wanted.
 * @returns The input array itself when it holds `count` items or fewer,
 *          otherwise a new array of at most `count` picks.
 */
function deterministicSample(
  candidates: { id: string; companyName: string }[],
  count: number,
): typeof candidates {
  // Nothing to thin out — hand the pool back untouched.
  if (candidates.length <= count) return candidates;

  // ID order makes the walk independent of input order.
  const ordered = candidates.slice().sort((lhs, rhs) => lhs.id.localeCompare(rhs.id));

  // Stride sized for ~3× more stops than needed, leaving room to skip repeat companies.
  const stride = Math.max(1, Math.floor(ordered.length / (count * 3)));

  const chosen: typeof candidates = [];
  const companiesUsed = new Set<string>();

  // Pass 1: one candidate per company.
  let pos = 0;
  while (pos < ordered.length && chosen.length < count) {
    const candidate = ordered[pos];
    if (!companiesUsed.has(candidate.companyName)) {
      companiesUsed.add(candidate.companyName);
      chosen.push(candidate);
    }
    pos += stride;
  }

  // Pass 2: still short — accept repeat companies from the same stops.
  const taken = new Set(chosen);
  pos = 0;
  while (pos < ordered.length && chosen.length < count) {
    const candidate = ordered[pos];
    if (!taken.has(candidate)) {
      taken.add(candidate);
      chosen.push(candidate);
    }
    pos += stride;
  }

  return chosen.slice(0, count);
}

/**
 * Shortens a model identifier for display: drops everything up to the last
 * "/", strips a trailing "-preview", and caps the result at 30 characters.
 *
 * Example: "google/gemini-3.1-flash-lite-preview" → "gemini-3.1-flash-lite"
 */
function shortModel(modelId: string): string {
  const previewSuffix = "-preview";

  // lastIndexOf yields -1 when there is no slash, so the slice starts at 0.
  const bareName = modelId.slice(modelId.lastIndexOf("/") + 1);

  const withoutSuffix = bareName.endsWith(previewSuffix)
    ? bareName.slice(0, bareName.length - previewSuffix.length)
    : bareName;

  return withoutSuffix.slice(0, 30);
}

/**
 * Writes one sampled paragraph to stdout: a ruled header with filing
 * metadata, the full paragraph text, then a boxed summary of each annotation.
 *
 * @param para Paragraph to display.
 * @param anns Annotations for that paragraph, printed in the given order.
 * @param index Running number shown in the header.
 */
function printSample(
  para: Paragraph,
  anns: Ann[],
  index: number,
) {
  const rule = "─".repeat(80);

  // Header
  console.log(`\n${rule}`);
  const filing = para.filing;
  console.log(`  [${index}] ${filing.companyName} (${filing.ticker}) — ${filing.filingType} ${filing.filingDate}`);
  // paragraphIndex is shown 1-based.
  console.log(`       Paragraph ${para.paragraphIndex + 1}, ${para.wordCount} words, ID: ${para.id}`);
  console.log(rule);

  // Body, padded by blank lines
  console.log();
  console.log(para.text);
  console.log();

  // One box per annotation
  for (const { label, provenance } of anns) {
    console.log(`  ┌─ ${shortModel(provenance.modelId)}`);
    console.log(`  │  Category: ${label.content_category} (${label.category_confidence})`);
    console.log(`  │  Specificity: ${label.specificity_level} (${label.specificity_confidence})`);
    console.log(`  │  Reasoning: ${label.reasoning}`);
    console.log(`  └─`);
  }
}

// ── Main ───────────────────────────────────────────────────────────────
/**
 * Entry point. Loads paragraphs and stage-1 annotations, sorts every paragraph
 * that has at least three annotations into four dispute pools, then prints a
 * deterministic sample of up to five paragraphs from each pool.
 *
 * A paragraph may land in more than one pool; pools are sampled independently.
 */
async function main() {
  /** Lightweight handle for a paragraph that landed in a dispute pool. */
  type Candidate = { id: string; companyName: string };

  /** Maximum number of paragraphs printed per dispute section. */
  const SAMPLES_PER_SECTION = 5;

  /** True when two label counts form a 2-vs-1 split in either direction. */
  const isTwoOneSplit = (a: number, b: number): boolean =>
    (a === 2 && b === 1) || (a === 1 && b === 2);

  console.log("Loading paragraphs...");
  const { records: paragraphs } = await readJsonl(PARAGRAPHS, Paragraph);
  console.log(`  ${paragraphs.length.toLocaleString()} paragraphs`);

  const paraById = new Map(paragraphs.map(p => [p.id, p]));

  console.log("Loading annotations...");
  const { records: rawAnns, skipped } = await readJsonlRaw(ANNOTATIONS);
  // NOTE: unchecked cast — raw records are trusted to match `Ann`.
  const anns = rawAnns as Ann[];
  console.log(`  ${anns.length.toLocaleString()} annotations (${skipped} skipped)`);

  // Group annotations by paragraph
  const byParagraph = new Map<string, Ann[]>();
  for (const a of anns) {
    let arr = byParagraph.get(a.paragraphId);
    if (!arr) { arr = []; byParagraph.set(a.paragraphId, arr); }
    arr.push(a);
  }

  // Count paragraphs per company (used to identify "small" companies below)
  const companyParaCount = new Map<string, number>();
  for (const p of paragraphs) {
    const name = p.filing.companyName;
    companyParaCount.set(name, (companyParaCount.get(name) ?? 0) + 1);
  }

  // ── Build candidate pools ────────────────────────────────────────────

  // Pool 1: Spec [3,4] disputes — specs are [3,3,4] or [3,4,4]
  const specDisputes: Candidate[] = [];

  // Pool 2: Management ↔ RMP disputes — 2 say one, 1 says the other
  const mgmtRmpDisputes: Candidate[] = [];

  // Pool 3: None/Other ↔ Strategy disputes
  const noneStrategyDisputes: Candidate[] = [];

  // Pool 4: Small company (1-3 paras) + unresolved (3-way cat split)
  const smallUnresolved: Candidate[] = [];

  for (const [pid, pannAnns] of byParagraph) {
    // Skip paragraphs without a full set of annotations or without a known paragraph record.
    if (pannAnns.length < 3) continue;
    const para = paraById.get(pid);
    if (!para) continue;

    const cats = pannAnns.map(a => a.label.content_category);
    const specs = pannAnns.map(a => a.label.specificity_level);
    const sortedSpecs = [...specs].sort((a, b) => a - b);
    const countOf = (category: string): number => cats.filter(c => c === category).length;

    const companyName = para.filing.companyName;

    // Pool 1: Spec [3,3,4] or [3,4,4]
    const specKey = sortedSpecs.join(",");
    if (specKey === "3,3,4" || specKey === "3,4,4") {
      specDisputes.push({ id: pid, companyName });
    }

    // Pool 2: Management ↔ RMP (2-1 split in either direction)
    if (isTwoOneSplit(countOf("Management Role"), countOf("Risk Management Process"))) {
      mgmtRmpDisputes.push({ id: pid, companyName });
    }

    // Pool 3: None/Other ↔ Strategy Integration (2-1 split in either direction)
    if (isTwoOneSplit(countOf("None/Other"), countOf("Strategy Integration"))) {
      noneStrategyDisputes.push({ id: pid, companyName });
    }

    // Pool 4: Small company (1-3 paras) with 3-way cat split (unresolved)
    const paraCount = companyParaCount.get(companyName) ?? 0;
    const uniqueCats = new Set(cats);
    if (paraCount <= 3 && uniqueCats.size === 3) {
      smallUnresolved.push({ id: pid, companyName });
    }
  }

  console.log(`\nCandidate pools:`);
  console.log(`  Spec [3,4] disputes:          ${specDisputes.length}`);
  console.log(`  Management ↔ RMP disputes:     ${mgmtRmpDisputes.length}`);
  console.log(`  None/Other ↔ Strategy disputes: ${noneStrategyDisputes.length}`);
  console.log(`  Small co. unresolved:          ${smallUnresolved.length}`);

  // ── Sample and print ────────────────────────────────────────────────

  const sections: [string, Candidate[]][] = [
    ["SPEC [3,4] DISPUTES — Models can't decide firm-specific vs quantified-verifiable", specDisputes],
    ["MANAGEMENT ↔ RMP DISPUTES — 2-vs-1 split between Management Role and Risk Management Process", mgmtRmpDisputes],
    ["NONE/OTHER ↔ STRATEGY INTEGRATION DISPUTES — 2-vs-1 split between None/Other and Strategy Integration", noneStrategyDisputes],
    ["SMALL COMPANY (1-3 PARAS) UNRESOLVED — 3-way category split, tiny filings", smallUnresolved],
  ];

  // Numbering runs across all sections, starting at 1.
  let globalIdx = 1;
  for (const [title, pool] of sections) {
    console.log(`\n${"═".repeat(80)}`);
    console.log(`  ${title}`);
    console.log(`${"═".repeat(80)}`);

    const sampled = deterministicSample(pool, SAMPLES_PER_SECTION);
    if (sampled.length === 0) {
      console.log("\n  (no candidates found)");
      continue;
    }

    for (const item of sampled) {
      // Both lookups are guaranteed to hit: pools only hold IDs that passed the checks above.
      const para = paraById.get(item.id)!;
      const pannAnns = byParagraph.get(item.id)!;
      printSample(para, pannAnns, globalIdx++);
    }
  }

  console.log(`\n${"═".repeat(80)}`);
  console.log(`  Done — ${globalIdx - 1} paragraphs printed.`);
  console.log(`${"═".repeat(80)}`);
}

// Run the script. On failure, log the error and set a non-zero exit code so
// shells and CI do not mistake a crashed run for a successful one.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});