SEC-cyBERT/ts/src/analyze/dedup-analysis.ts
2026-03-28 23:44:37 -04:00

222 lines
8.3 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Deduplication analysis: hash-based detection of identical/near-identical paragraphs.
* Tracks cross-filing and cross-year persistence of boilerplate text.
*/
import { readJsonl } from "../lib/jsonl.ts";
import { Paragraph } from "@sec-cybert/schemas/paragraph.ts";
import type { Paragraph as ParagraphType } from "@sec-cybert/schemas/paragraph.ts";
import { writeFile, mkdir } from "node:fs/promises";
import { dirname } from "node:path";
// Data root, relative to the process's current working directory.
const DATA = "../data";
// Input corpus: one JSON-serialized Paragraph record per line (JSONL).
const PARAGRAPHS_PATH = `${DATA}/paragraphs/paragraphs.jsonl`;
/**
 * Round `n` to `d` decimal places (default 2).
 * Uses scale-round-unscale, so results are subject to the usual
 * binary floating-point representation limits.
 */
function round(n: number, d = 2): number {
  const scale = 10 ** d;
  return Math.round(n * scale) / scale;
}
/** Aggregated view of all paragraphs that share one textHash. */
interface HashGroup {
hash: string; // the shared textHash
count: number; // total paragraphs carrying this hash
companies: Set<string>; // distinct filer CIKs the text appears under
filings: Set<string>; // distinct accession numbers
fiscalYears: Set<number>; // distinct fiscal years
sampleText: string; // first 200 chars of one representative paragraph
sampleCompanies: string[]; // up to 5 distinct company names from the group
}
/**
 * Group paragraphs by textHash and summarize each group.
 *
 * @param paragraphs - full corpus of validated Paragraph records
 * @returns one HashGroup per distinct textHash, sorted by descending copy
 *   count; the first paragraph seen supplies the sample text, and up to 5
 *   distinct company names are kept as samples.
 */
function buildGroups(paragraphs: ParagraphType[]): HashGroup[] {
  // Bucket paragraphs by their content hash.
  const byHash = new Map<string, ParagraphType[]>();
  for (const p of paragraphs) {
    const bucket = byHash.get(p.textHash);
    if (bucket) bucket.push(p);
    else byHash.set(p.textHash, [p]);
  }
  // Summarize each bucket into a HashGroup.
  const groups: HashGroup[] = [];
  for (const [hash, paras] of byHash) {
    groups.push({
      hash,
      count: paras.length,
      companies: new Set(paras.map((p) => p.filing.cik)),
      filings: new Set(paras.map((p) => p.filing.accessionNumber)),
      fiscalYears: new Set(paras.map((p) => p.filing.fiscalYear)),
      sampleText: paras[0]!.text.slice(0, 200),
      sampleCompanies: [...new Set(paras.map((p) => p.filing.companyName))].slice(0, 5),
    });
  }
  // Largest duplicate groups first.
  groups.sort((a, b) => b.count - a.count);
  return groups;
}

// Ascending numeric comparator. Array.prototype.sort() without a comparator
// sorts by string representation, which only happened to work here because
// fiscal years are all 4 digits wide.
const numericAsc = (a: number, b: number): number => a - b;

/**
 * Deduplication analysis entry point.
 *
 * Loads every paragraph, groups identical text by hash, then writes a
 * human-readable report (stdout + dedup-analysis.txt) and a machine-readable
 * summary (dedup-analysis.json) covering cross-company boilerplate,
 * year-over-year copy-paste, within-filing parser duplicates, and the
 * labeling cost saved by deduplicating before LLM annotation.
 */
async function main() {
  process.stderr.write(" Loading paragraphs...\n");
  const { records: paragraphs, skipped } = await readJsonl(PARAGRAPHS_PATH, Paragraph);
  if (skipped > 0) process.stderr.write(` Warning: ${skipped} invalid lines skipped\n`);
  process.stderr.write(` Loaded ${paragraphs.length} paragraphs\n\n`);

  const groups = buildGroups(paragraphs);

  // ─── Statistics ───
  const totalParagraphs = paragraphs.length;
  const uniqueHashes = groups.length;
  const duplicatedGroups = groups.filter((g) => g.count > 1);
  const duplicatedParagraphs = duplicatedGroups.reduce((sum, g) => sum + g.count, 0);
  // "Excess" = copies beyond the first in each group, i.e. what dedup removes.
  const excessParagraphs = duplicatedGroups.reduce((sum, g) => sum + g.count - 1, 0);
  // Percentage of the corpus; guarded so an empty corpus prints 0, not NaN.
  const pct = (n: number): number =>
    totalParagraphs === 0 ? 0 : round((n / totalParagraphs) * 100);
  // Cross-company duplicates (same text in different companies = boilerplate)
  const crossCompanyGroups = groups.filter((g) => g.companies.size > 1);
  const crossCompanyParagraphs = crossCompanyGroups.reduce((sum, g) => sum + g.count, 0);
  // Cross-year duplicates (same company, same text, different fiscal years = copy-paste)
  const crossYearGroups = groups.filter((g) => g.fiscalYears.size > 1);
  const crossYearParagraphs = crossYearGroups.reduce((sum, g) => sum + g.count, 0);
  // Same-company same-year (within-filing duplicates = parser artifact?)
  const sameCompanySameYear = groups.filter(
    (g) => g.count > 1 && g.companies.size === 1 && g.fiscalYears.size === 1,
  );

  // ─── Report ───
  const lines: string[] = [];
  const hr = "═".repeat(72);
  const sr = "─".repeat(72);
  lines.push(hr);
  lines.push(" SEC-cyBERT Deduplication Analysis");
  lines.push(` Generated: ${new Date().toISOString()}`);
  lines.push(hr);
  lines.push("");
  lines.push("1. OVERVIEW");
  lines.push(sr);
  lines.push(` Total paragraphs: ${totalParagraphs.toLocaleString()}`);
  lines.push(` Unique text hashes: ${uniqueHashes.toLocaleString()}`);
  lines.push(` Duplicate groups: ${duplicatedGroups.length.toLocaleString()}`);
  lines.push(` Paragraphs in dup groups: ${duplicatedParagraphs.toLocaleString()} (${pct(duplicatedParagraphs)}%)`);
  lines.push(` Excess (dedup savings): ${excessParagraphs.toLocaleString()} (${pct(excessParagraphs)}%)`);
  lines.push(` After dedup: ${(totalParagraphs - excessParagraphs).toLocaleString()} unique paragraphs`);
  lines.push("");
  lines.push("2. DUPLICATE TYPES");
  lines.push(sr);
  lines.push(` Cross-company (boilerplate templates): ${crossCompanyGroups.length} groups, ${crossCompanyParagraphs} paragraphs`);
  lines.push(` Cross-year (copy-paste year-to-year): ${crossYearGroups.length} groups, ${crossYearParagraphs} paragraphs`);
  lines.push(` Same-company same-year (parser dupes): ${sameCompanySameYear.length} groups`);
  lines.push("");

  // Distribution of duplicate group sizes
  const sizeBuckets: Record<string, number> = {
    "2 copies": 0,
    "3-5 copies": 0,
    "6-10 copies": 0,
    "11-20 copies": 0,
    "21-50 copies": 0,
    "50+ copies": 0,
  };
  for (const g of duplicatedGroups) {
    if (g.count === 2) sizeBuckets["2 copies"]!++;
    else if (g.count <= 5) sizeBuckets["3-5 copies"]!++;
    else if (g.count <= 10) sizeBuckets["6-10 copies"]!++;
    else if (g.count <= 20) sizeBuckets["11-20 copies"]!++;
    else if (g.count <= 50) sizeBuckets["21-50 copies"]!++;
    else sizeBuckets["50+ copies"]!++;
  }
  lines.push("3. DUPLICATE GROUP SIZE DISTRIBUTION");
  lines.push(sr);
  for (const [bucket, count] of Object.entries(sizeBuckets)) {
    if (count > 0) {
      lines.push(` ${bucket.padEnd(20)} ${count} groups`);
    }
  }
  lines.push("");

  // Top cross-company boilerplate
  lines.push("4. TOP CROSS-COMPANY BOILERPLATE (same text, different companies)");
  lines.push(sr);
  for (const g of crossCompanyGroups.slice(0, 15)) {
    lines.push(` [${g.count} copies across ${g.companies.size} companies]`);
    lines.push(` Companies: ${g.sampleCompanies.join(", ")}${g.companies.size > 5 ? ` (+${g.companies.size - 5} more)` : ""}`);
    lines.push(` Text: "${g.sampleText}..."`);
    lines.push("");
  }

  // Top cross-year persistence
  lines.push("5. TOP CROSS-YEAR PERSISTENCE (same text, different fiscal years)");
  lines.push(sr);
  const crossYearSorted = [...crossYearGroups].sort((a, b) => b.count - a.count);
  for (const g of crossYearSorted.slice(0, 10)) {
    lines.push(` [${g.count} copies, years: ${[...g.fiscalYears].sort(numericAsc).join(", ")}, ${g.companies.size} companies]`);
    lines.push(` Text: "${g.sampleText}..."`);
    lines.push("");
  }

  // Labeling efficiency
  lines.push("6. LABELING EFFICIENCY");
  lines.push(sr);
  const uniqueToLabel = totalParagraphs - excessParagraphs;
  const labelSavings = excessParagraphs;
  const costPerLabel = 0.001; // rough estimate per paragraph
  lines.push(` Paragraphs to label (after dedup): ${uniqueToLabel.toLocaleString()}`);
  lines.push(` LLM calls saved by dedup: ${labelSavings.toLocaleString()}`);
  lines.push(` Estimated cost savings: $${round(labelSavings * costPerLabel * 3)}`);
  lines.push(` (3 Stage-1 models × ~$0.001/paragraph)`);
  lines.push("");
  lines.push(hr);
  lines.push(" END OF ANALYSIS");
  lines.push(hr);
  const report = lines.join("\n");

  // ─── Save ───
  const reportPath = `${DATA}/analysis/dedup-analysis.txt`;
  const jsonPath = `${DATA}/analysis/dedup-analysis.json`;
  await mkdir(dirname(reportPath), { recursive: true });
  const jsonData = {
    generatedAt: new Date().toISOString(),
    totalParagraphs,
    uniqueHashes,
    duplicatedGroups: duplicatedGroups.length,
    excessParagraphs,
    afterDedup: totalParagraphs - excessParagraphs,
    crossCompany: { groups: crossCompanyGroups.length, paragraphs: crossCompanyParagraphs },
    crossYear: { groups: crossYearGroups.length, paragraphs: crossYearParagraphs },
    sameCompanySameYear: sameCompanySameYear.length,
    topBoilerplate: crossCompanyGroups.slice(0, 30).map((g) => ({
      hash: g.hash,
      count: g.count,
      companies: g.companies.size,
      fiscalYears: [...g.fiscalYears].sort(numericAsc),
      sampleText: g.sampleText,
      sampleCompanies: g.sampleCompanies,
    })),
    topDuplicates: groups.slice(0, 50).map((g) => ({
      hash: g.hash,
      count: g.count,
      companies: g.companies.size,
      filings: g.filings.size,
      fiscalYears: [...g.fiscalYears].sort(numericAsc),
      sampleText: g.sampleText,
    })),
  };
  // The two output files are independent; write them concurrently.
  await Promise.all([
    writeFile(jsonPath, JSON.stringify(jsonData, null, 2)),
    writeFile(reportPath, report),
  ]);
  process.stderr.write(` Saved: ${reportPath}\n`);
  process.stderr.write(` Saved: ${jsonPath}\n`);
  console.log(report);
}
// Script entry: run the analysis; on any unhandled failure, log the error
// and exit non-zero so CI/pipelines see the failure.
main().catch((err: unknown) => {
  console.error(err);
  process.exit(1);
});