SEC-cyBERT/ts/src/analyze/dedup-analysis.ts
2026-03-28 23:44:37 -04:00

222 lines
8.3 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Deduplication analysis: hash-based detection of identical/near-identical paragraphs.
* Tracks cross-filing and cross-year persistence of boilerplate text.
*/
import { readJsonl } from "../lib/jsonl.ts";
import { Paragraph } from "@sec-cybert/schemas/paragraph.ts";
import type { Paragraph as ParagraphType } from "@sec-cybert/schemas/paragraph.ts";
import { writeFile, mkdir } from "node:fs/promises";
import { dirname } from "node:path";
// Data root, relative to the process's current working directory.
const DATA = "../data";
// Input corpus: one JSON-serialized Paragraph record per line (JSONL).
const PARAGRAPHS_PATH = `${DATA}/paragraphs/paragraphs.jsonl`;
/**
 * Round `n` to `d` decimal places (default 2).
 * Uses scale-round-unscale, so results are subject to the usual
 * binary floating-point representation limits.
 */
function round(n: number, d = 2): number {
  const scale = 10 ** d;
  return Math.round(n * scale) / scale;
}
/** Aggregated view of all paragraphs that share one textHash. */
interface HashGroup {
hash: string; // the shared textHash
count: number; // total paragraphs carrying this hash
companies: Set<string>; // distinct filer CIKs the text appears under
filings: Set<string>; // distinct accession numbers
fiscalYears: Set<number>; // distinct fiscal years
sampleText: string; // first 200 chars of one representative paragraph
sampleCompanies: string[]; // up to 5 distinct company names from the group
}
/**
 * Group paragraphs by textHash and summarize each group.
 *
 * @param paragraphs - full corpus of validated Paragraph records
 * @returns one HashGroup per distinct textHash, sorted by descending copy
 *   count; the first paragraph seen supplies the sample text, and up to 5
 *   distinct company names are kept as samples.
 */
function buildGroups(paragraphs: ParagraphType[]): HashGroup[] {
  // Bucket paragraphs by their content hash.
  const byHash = new Map<string, ParagraphType[]>();
  for (const p of paragraphs) {
    const bucket = byHash.get(p.textHash);
    if (bucket) bucket.push(p);
    else byHash.set(p.textHash, [p]);
  }
  // Summarize each bucket into a HashGroup.
  const groups: HashGroup[] = [];
  for (const [hash, paras] of byHash) {
    groups.push({
      hash,
      count: paras.length,
      companies: new Set(paras.map((p) => p.filing.cik)),
      filings: new Set(paras.map((p) => p.filing.accessionNumber)),
      fiscalYears: new Set(paras.map((p) => p.filing.fiscalYear)),
      sampleText: paras[0]!.text.slice(0, 200),
      sampleCompanies: [...new Set(paras.map((p) => p.filing.companyName))].slice(0, 5),
    });
  }
  // Largest duplicate groups first.
  groups.sort((a, b) => b.count - a.count);
  return groups;
}

// Ascending numeric comparator. Array.prototype.sort() without a comparator
// sorts by string representation, which only happened to work here because
// fiscal years are all 4 digits wide.
const numericAsc = (a: number, b: number): number => a - b;

/**
 * Deduplication analysis entry point.
 *
 * Loads every paragraph, groups identical text by hash, then writes a
 * human-readable report (stdout + dedup-analysis.txt) and a machine-readable
 * summary (dedup-analysis.json) covering cross-company boilerplate,
 * year-over-year copy-paste, within-filing parser duplicates, and the
 * labeling cost saved by deduplicating before LLM annotation.
 */
async function main() {
  process.stderr.write(" Loading paragraphs...\n");
  const { records: paragraphs, skipped } = await readJsonl(PARAGRAPHS_PATH, Paragraph);
  if (skipped > 0) process.stderr.write(` Warning: ${skipped} invalid lines skipped\n`);
  process.stderr.write(` Loaded ${paragraphs.length} paragraphs\n\n`);

  const groups = buildGroups(paragraphs);

  // ─── Statistics ───
  const totalParagraphs = paragraphs.length;
  const uniqueHashes = groups.length;
  const duplicatedGroups = groups.filter((g) => g.count > 1);
  const duplicatedParagraphs = duplicatedGroups.reduce((sum, g) => sum + g.count, 0);
  // "Excess" = copies beyond the first in each group, i.e. what dedup removes.
  const excessParagraphs = duplicatedGroups.reduce((sum, g) => sum + g.count - 1, 0);
  // Percentage of the corpus; guarded so an empty corpus prints 0, not NaN.
  const pct = (n: number): number =>
    totalParagraphs === 0 ? 0 : round((n / totalParagraphs) * 100);
  // Cross-company duplicates (same text in different companies = boilerplate)
  const crossCompanyGroups = groups.filter((g) => g.companies.size > 1);
  const crossCompanyParagraphs = crossCompanyGroups.reduce((sum, g) => sum + g.count, 0);
  // Cross-year duplicates (same company, same text, different fiscal years = copy-paste)
  const crossYearGroups = groups.filter((g) => g.fiscalYears.size > 1);
  const crossYearParagraphs = crossYearGroups.reduce((sum, g) => sum + g.count, 0);
  // Same-company same-year (within-filing duplicates = parser artifact?)
  const sameCompanySameYear = groups.filter(
    (g) => g.count > 1 && g.companies.size === 1 && g.fiscalYears.size === 1,
  );

  // ─── Report ───
  const lines: string[] = [];
  const hr = "═".repeat(72);
  const sr = "─".repeat(72);
  lines.push(hr);
  lines.push(" SEC-cyBERT Deduplication Analysis");
  lines.push(` Generated: ${new Date().toISOString()}`);
  lines.push(hr);
  lines.push("");
  lines.push("1. OVERVIEW");
  lines.push(sr);
  lines.push(` Total paragraphs: ${totalParagraphs.toLocaleString()}`);
  lines.push(` Unique text hashes: ${uniqueHashes.toLocaleString()}`);
  lines.push(` Duplicate groups: ${duplicatedGroups.length.toLocaleString()}`);
  lines.push(` Paragraphs in dup groups: ${duplicatedParagraphs.toLocaleString()} (${pct(duplicatedParagraphs)}%)`);
  lines.push(` Excess (dedup savings): ${excessParagraphs.toLocaleString()} (${pct(excessParagraphs)}%)`);
  lines.push(` After dedup: ${(totalParagraphs - excessParagraphs).toLocaleString()} unique paragraphs`);
  lines.push("");
  lines.push("2. DUPLICATE TYPES");
  lines.push(sr);
  lines.push(` Cross-company (boilerplate templates): ${crossCompanyGroups.length} groups, ${crossCompanyParagraphs} paragraphs`);
  lines.push(` Cross-year (copy-paste year-to-year): ${crossYearGroups.length} groups, ${crossYearParagraphs} paragraphs`);
  lines.push(` Same-company same-year (parser dupes): ${sameCompanySameYear.length} groups`);
  lines.push("");

  // Distribution of duplicate group sizes
  const sizeBuckets: Record<string, number> = {
    "2 copies": 0,
    "3-5 copies": 0,
    "6-10 copies": 0,
    "11-20 copies": 0,
    "21-50 copies": 0,
    "50+ copies": 0,
  };
  for (const g of duplicatedGroups) {
    if (g.count === 2) sizeBuckets["2 copies"]!++;
    else if (g.count <= 5) sizeBuckets["3-5 copies"]!++;
    else if (g.count <= 10) sizeBuckets["6-10 copies"]!++;
    else if (g.count <= 20) sizeBuckets["11-20 copies"]!++;
    else if (g.count <= 50) sizeBuckets["21-50 copies"]!++;
    else sizeBuckets["50+ copies"]!++;
  }
  lines.push("3. DUPLICATE GROUP SIZE DISTRIBUTION");
  lines.push(sr);
  for (const [bucket, count] of Object.entries(sizeBuckets)) {
    if (count > 0) {
      lines.push(` ${bucket.padEnd(20)} ${count} groups`);
    }
  }
  lines.push("");

  // Top cross-company boilerplate
  lines.push("4. TOP CROSS-COMPANY BOILERPLATE (same text, different companies)");
  lines.push(sr);
  for (const g of crossCompanyGroups.slice(0, 15)) {
    lines.push(` [${g.count} copies across ${g.companies.size} companies]`);
    lines.push(` Companies: ${g.sampleCompanies.join(", ")}${g.companies.size > 5 ? ` (+${g.companies.size - 5} more)` : ""}`);
    lines.push(` Text: "${g.sampleText}..."`);
    lines.push("");
  }

  // Top cross-year persistence
  lines.push("5. TOP CROSS-YEAR PERSISTENCE (same text, different fiscal years)");
  lines.push(sr);
  const crossYearSorted = [...crossYearGroups].sort((a, b) => b.count - a.count);
  for (const g of crossYearSorted.slice(0, 10)) {
    lines.push(` [${g.count} copies, years: ${[...g.fiscalYears].sort(numericAsc).join(", ")}, ${g.companies.size} companies]`);
    lines.push(` Text: "${g.sampleText}..."`);
    lines.push("");
  }

  // Labeling efficiency
  lines.push("6. LABELING EFFICIENCY");
  lines.push(sr);
  const uniqueToLabel = totalParagraphs - excessParagraphs;
  const labelSavings = excessParagraphs;
  const costPerLabel = 0.001; // rough estimate per paragraph
  lines.push(` Paragraphs to label (after dedup): ${uniqueToLabel.toLocaleString()}`);
  lines.push(` LLM calls saved by dedup: ${labelSavings.toLocaleString()}`);
  lines.push(` Estimated cost savings: $${round(labelSavings * costPerLabel * 3)}`);
  lines.push(` (3 Stage-1 models × ~$0.001/paragraph)`);
  lines.push("");
  lines.push(hr);
  lines.push(" END OF ANALYSIS");
  lines.push(hr);
  const report = lines.join("\n");

  // ─── Save ───
  const reportPath = `${DATA}/analysis/dedup-analysis.txt`;
  const jsonPath = `${DATA}/analysis/dedup-analysis.json`;
  await mkdir(dirname(reportPath), { recursive: true });
  const jsonData = {
    generatedAt: new Date().toISOString(),
    totalParagraphs,
    uniqueHashes,
    duplicatedGroups: duplicatedGroups.length,
    excessParagraphs,
    afterDedup: totalParagraphs - excessParagraphs,
    crossCompany: { groups: crossCompanyGroups.length, paragraphs: crossCompanyParagraphs },
    crossYear: { groups: crossYearGroups.length, paragraphs: crossYearParagraphs },
    sameCompanySameYear: sameCompanySameYear.length,
    topBoilerplate: crossCompanyGroups.slice(0, 30).map((g) => ({
      hash: g.hash,
      count: g.count,
      companies: g.companies.size,
      fiscalYears: [...g.fiscalYears].sort(numericAsc),
      sampleText: g.sampleText,
      sampleCompanies: g.sampleCompanies,
    })),
    topDuplicates: groups.slice(0, 50).map((g) => ({
      hash: g.hash,
      count: g.count,
      companies: g.companies.size,
      filings: g.filings.size,
      fiscalYears: [...g.fiscalYears].sort(numericAsc),
      sampleText: g.sampleText,
    })),
  };
  // The two output files are independent; write them concurrently.
  await Promise.all([
    writeFile(jsonPath, JSON.stringify(jsonData, null, 2)),
    writeFile(reportPath, report),
  ]);
  process.stderr.write(` Saved: ${reportPath}\n`);
  process.stderr.write(` Saved: ${jsonPath}\n`);
  console.log(report);
}
// Script entry: run the analysis; on any unhandled failure, log the error
// and exit non-zero so CI/pipelines see the failure.
main().catch((err: unknown) => {
  console.error(err);
  process.exit(1);
});