/**
 * Deduplication analysis: hash-based detection of identical/near-identical paragraphs.
 * Tracks cross-filing and cross-year persistence of boilerplate text.
 */
import { readJsonl } from "../lib/jsonl.ts";
import { Paragraph } from "@sec-cybert/schemas/paragraph.ts";
import type { Paragraph as ParagraphType } from "@sec-cybert/schemas/paragraph.ts";
import { writeFile, mkdir } from "node:fs/promises";
import { dirname } from "node:path";

const DATA = "../data";
const PARAGRAPHS_PATH = `${DATA}/paragraphs/paragraphs.jsonl`;

/** Rough per-paragraph labeling cost estimate used in the savings figure. */
const COST_PER_LABEL = 0.001;
/** Number of Stage-1 models, i.e. labeling calls made per paragraph. */
const STAGE1_MODEL_COUNT = 3;

/**
 * Upper bounds (inclusive) of the duplicate-group size buckets, in report order.
 * Any group larger than the last bound falls into {@link OVERFLOW_BUCKET}.
 */
const SIZE_BUCKETS = [
  { label: "2 copies", max: 2 },
  { label: "3-5 copies", max: 5 },
  { label: "6-10 copies", max: 10 },
  { label: "11-20 copies", max: 20 },
  { label: "21-50 copies", max: 50 },
] as const;
const OVERFLOW_BUCKET = "50+ copies";

// Field types are derived from the schema type instead of being restated here.
type Filing = ParagraphType["filing"];
type FiscalYear = Filing["fiscalYear"];

/**
 * Rounds `n` to `d` decimal places.
 *
 * @param n - Value to round.
 * @param d - Number of decimal places to keep (default 2).
 */
function round(n: number, d = 2): number {
  return Math.round(n * 10 ** d) / 10 ** d;
}

/**
 * Returns `part` as a percentage of `total`, rounded to two decimals.
 * Yields 0 for an empty total so the report never prints `NaN%`.
 */
function pct(part: number, total: number): number {
  return total === 0 ? 0 : round((part / total) * 100);
}

/** Maps a duplicate-group size to the label of the bucket it belongs to. */
function sizeBucketLabel(count: number): string {
  return SIZE_BUCKETS.find((bucket) => count <= bucket.max)?.label ?? OVERFLOW_BUCKET;
}

/** Aggregated facts about all paragraphs that share one text hash. */
interface HashGroup {
  hash: string;
  /** Number of paragraphs carrying this hash. */
  count: number;
  /** Distinct `filing.cik` values in the group. */
  companies: Set<Filing["cik"]>;
  /** Distinct `filing.accessionNumber` values in the group. */
  filings: Set<Filing["accessionNumber"]>;
  /** Distinct `filing.fiscalYear` values in the group. */
  fiscalYears: Set<FiscalYear>;
  /** First 200 characters of the first paragraph seen for this hash. */
  sampleText: string;
  /** Up to five distinct company names from the group. */
  sampleCompanies: string[];
}

/**
 * Returns the group's fiscal years as a sorted array.
 * NOTE(review): uses the default `sort()`, which compares string forms; that
 * presumably matches numeric order for four-digit years — confirm against the schema.
 */
function sortedYears(group: HashGroup): FiscalYear[] {
  return [...group.fiscalYears].sort();
}

/**
 * Loads all paragraphs, groups them by `textHash`, and writes a text report and a
 * JSON summary of the duplication statistics under `${DATA}/analysis/`. The text
 * report is also printed to stdout; progress messages go to stderr.
 */
async function main(): Promise<void> {
  process.stderr.write(" Loading paragraphs...\n");
  const { records: paragraphs, skipped } = await readJsonl(PARAGRAPHS_PATH, Paragraph);
  if (skipped > 0) process.stderr.write(` Warning: ${skipped} invalid lines skipped\n`);
  process.stderr.write(` Loaded ${paragraphs.length} paragraphs\n\n`);

  // Group by textHash
  const byHash = new Map<string, ParagraphType[]>();
  for (const p of paragraphs) {
    const bucket = byHash.get(p.textHash);
    if (bucket) bucket.push(p);
    else byHash.set(p.textHash, [p]);
  }

  // Analyze each group
  const groups: HashGroup[] = [];
  for (const [hash, paras] of byHash) {
    const [first] = paras;
    if (!first) continue; // cannot happen: a group is only created with its first paragraph
    groups.push({
      hash,
      count: paras.length,
      companies: new Set(paras.map((p) => p.filing.cik)),
      filings: new Set(paras.map((p) => p.filing.accessionNumber)),
      fiscalYears: new Set(paras.map((p) => p.filing.fiscalYear)),
      sampleText: first.text.slice(0, 200),
      sampleCompanies: [...new Set(paras.map((p) => p.filing.companyName))].slice(0, 5),
    });
  }
  // Largest groups first; every filtered view below inherits this order.
  groups.sort((a, b) => b.count - a.count);

  // ─── Statistics ───
  const totalParagraphs = paragraphs.length;
  const uniqueHashes = groups.length;
  const duplicatedGroups = groups.filter((g) => g.count > 1);
  const duplicatedParagraphs = duplicatedGroups.reduce((sum, g) => sum + g.count, 0);
  const excessParagraphs = duplicatedGroups.reduce((sum, g) => sum + g.count - 1, 0);
  const afterDedup = totalParagraphs - excessParagraphs;

  // Cross-company duplicates (same text in different companies = boilerplate)
  const crossCompanyGroups = groups.filter((g) => g.companies.size > 1);
  const crossCompanyParagraphs = crossCompanyGroups.reduce((sum, g) => sum + g.count, 0);

  // Cross-year duplicates (same text in more than one fiscal year). This does not
  // require the copies to come from the same company, so it overlaps with the
  // cross-company groups above.
  const crossYearGroups = groups.filter((g) => g.fiscalYears.size > 1);
  const crossYearParagraphs = crossYearGroups.reduce((sum, g) => sum + g.count, 0);

  // Same-company same-year (within-filing duplicates = parser artifact?)
  const sameCompanySameYear = groups.filter(
    (g) => g.count > 1 && g.companies.size === 1 && g.fiscalYears.size === 1,
  );

  // ─── Report ───
  // One timestamp shared by the text report and the JSON summary.
  const generatedAt = new Date().toISOString();
  const lines: string[] = [];
  const hr = "═".repeat(72);
  const sr = "─".repeat(72);

  lines.push(hr);
  lines.push(" SEC-cyBERT Deduplication Analysis");
  lines.push(` Generated: ${generatedAt}`);
  lines.push(hr);
  lines.push("");

  lines.push("1. OVERVIEW");
  lines.push(sr);
  lines.push(` Total paragraphs: ${totalParagraphs.toLocaleString()}`);
  lines.push(` Unique text hashes: ${uniqueHashes.toLocaleString()}`);
  lines.push(` Duplicate groups: ${duplicatedGroups.length.toLocaleString()}`);
  lines.push(` Paragraphs in dup groups: ${duplicatedParagraphs.toLocaleString()} (${pct(duplicatedParagraphs, totalParagraphs)}%)`);
  lines.push(` Excess (dedup savings): ${excessParagraphs.toLocaleString()} (${pct(excessParagraphs, totalParagraphs)}%)`);
  lines.push(` After dedup: ${afterDedup.toLocaleString()} unique paragraphs`);
  lines.push("");

  lines.push("2. DUPLICATE TYPES");
  lines.push(sr);
  lines.push(` Cross-company (boilerplate templates): ${crossCompanyGroups.length} groups, ${crossCompanyParagraphs} paragraphs`);
  lines.push(` Cross-year (copy-paste year-to-year): ${crossYearGroups.length} groups, ${crossYearParagraphs} paragraphs`);
  lines.push(` Same-company same-year (parser dupes): ${sameCompanySameYear.length} groups`);
  lines.push("");

  // Distribution of duplicate group sizes. The Map is pre-seeded so buckets are
  // reported in a fixed order regardless of which one is hit first.
  const bucketCounts = new Map<string, number>(
    [...SIZE_BUCKETS.map((b) => b.label), OVERFLOW_BUCKET].map(
      (label): [string, number] => [label, 0],
    ),
  );
  for (const g of duplicatedGroups) {
    const label = sizeBucketLabel(g.count);
    bucketCounts.set(label, (bucketCounts.get(label) ?? 0) + 1);
  }

  lines.push("3. DUPLICATE GROUP SIZE DISTRIBUTION");
  lines.push(sr);
  for (const [bucket, count] of bucketCounts) {
    if (count > 0) {
      lines.push(` ${bucket.padEnd(20)} ${count} groups`);
    }
  }
  lines.push("");

  // Top cross-company boilerplate
  lines.push("4. TOP CROSS-COMPANY BOILERPLATE (same text, different companies)");
  lines.push(sr);
  for (const g of crossCompanyGroups.slice(0, 15)) {
    const more = g.companies.size > 5 ? ` (+${g.companies.size - 5} more)` : "";
    lines.push(` [${g.count} copies across ${g.companies.size} companies]`);
    lines.push(` Companies: ${g.sampleCompanies.join(", ")}${more}`);
    lines.push(` Text: "${g.sampleText}..."`);
    lines.push("");
  }

  // Top cross-year persistence (crossYearGroups is already in descending count order)
  lines.push("5. TOP CROSS-YEAR PERSISTENCE (same text, different fiscal years)");
  lines.push(sr);
  for (const g of crossYearGroups.slice(0, 10)) {
    lines.push(` [${g.count} copies, years: ${sortedYears(g).join(", ")}, ${g.companies.size} companies]`);
    lines.push(` Text: "${g.sampleText}..."`);
    lines.push("");
  }

  // Labeling efficiency: every excess copy is one paragraph that need not be labeled.
  lines.push("6. LABELING EFFICIENCY");
  lines.push(sr);
  const labelSavings = excessParagraphs;
  lines.push(` Paragraphs to label (after dedup): ${afterDedup.toLocaleString()}`);
  lines.push(` LLM calls saved by dedup: ${labelSavings.toLocaleString()}`);
  lines.push(` Estimated cost savings: $${round(labelSavings * COST_PER_LABEL * STAGE1_MODEL_COUNT)}`);
  lines.push(` (${STAGE1_MODEL_COUNT} Stage-1 models × ~$${COST_PER_LABEL}/paragraph)`);
  lines.push("");
  lines.push(hr);
  lines.push(" END OF ANALYSIS");
  lines.push(hr);

  const report = lines.join("\n");

  // Save
  const reportPath = `${DATA}/analysis/dedup-analysis.txt`;
  const jsonPath = `${DATA}/analysis/dedup-analysis.json`;
  await mkdir(dirname(reportPath), { recursive: true });

  const jsonData = {
    generatedAt,
    totalParagraphs,
    uniqueHashes,
    duplicatedGroups: duplicatedGroups.length,
    excessParagraphs,
    afterDedup,
    crossCompany: { groups: crossCompanyGroups.length, paragraphs: crossCompanyParagraphs },
    crossYear: { groups: crossYearGroups.length, paragraphs: crossYearParagraphs },
    sameCompanySameYear: sameCompanySameYear.length,
    topBoilerplate: crossCompanyGroups.slice(0, 30).map((g) => ({
      hash: g.hash,
      count: g.count,
      companies: g.companies.size,
      fiscalYears: sortedYears(g),
      sampleText: g.sampleText,
      sampleCompanies: g.sampleCompanies,
    })),
    topDuplicates: groups.slice(0, 50).map((g) => ({
      hash: g.hash,
      count: g.count,
      companies: g.companies.size,
      filings: g.filings.size,
      fiscalYears: sortedYears(g),
      sampleText: g.sampleText,
    })),
  };

  await writeFile(jsonPath, JSON.stringify(jsonData, null, 2));
  await writeFile(reportPath, report);
  process.stderr.write(` Saved: ${reportPath}\n`);
  process.stderr.write(` Saved: ${jsonPath}\n`);

  console.log(report);
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});