/**
 * Deduplication analysis: hash-based detection of identical/near-identical paragraphs.
 * Tracks cross-filing and cross-year persistence of boilerplate text.
 */
import { mkdir, writeFile } from "node:fs/promises";
import { dirname } from "node:path";

import { Paragraph } from "@sec-cybert/schemas/paragraph.ts";
import type { Paragraph as ParagraphType } from "@sec-cybert/schemas/paragraph.ts";

import { readJsonl } from "../lib/jsonl.ts";
// Root of the data directory, relative to this script's working directory.
const DATA = "../data";
// JSONL input: one extracted filing paragraph per line.
const PARAGRAPHS_PATH = `${DATA}/paragraphs/paragraphs.jsonl`;
function round(n: number, d = 2): number {
|
||
return Math.round(n * 10 ** d) / 10 ** d;
|
||
}
|
||
|
||
/** Aggregated view of one set of paragraphs sharing the same text hash. */
interface HashGroup {
  /** Content hash shared by every paragraph in this group. */
  hash: string;
  /** Total number of paragraph occurrences with this hash. */
  count: number;
  /** Distinct company CIKs in which this text appears. */
  companies: Set<string>;
  /** Distinct filing accession numbers containing this text. */
  filings: Set<string>;
  /** Distinct fiscal years in which this text appears. */
  fiscalYears: Set<number>;
  /** First 200 characters of a representative paragraph. */
  sampleText: string;
  /** Up to 5 distinct company names, for display in the report. */
  sampleCompanies: string[];
}
async function main() {
|
||
process.stderr.write(" Loading paragraphs...\n");
|
||
const { records: paragraphs, skipped } = await readJsonl(PARAGRAPHS_PATH, Paragraph);
|
||
if (skipped > 0) process.stderr.write(` Warning: ${skipped} invalid lines skipped\n`);
|
||
process.stderr.write(` Loaded ${paragraphs.length} paragraphs\n\n`);
|
||
|
||
// Group by textHash
|
||
const hashGroups = new Map<string, { paragraphs: ParagraphType[] }>();
|
||
for (const p of paragraphs) {
|
||
const existing = hashGroups.get(p.textHash) ?? { paragraphs: [] };
|
||
existing.paragraphs.push(p);
|
||
hashGroups.set(p.textHash, existing);
|
||
}
|
||
|
||
// Analyze each group
|
||
const groups: HashGroup[] = [];
|
||
for (const [hash, { paragraphs: paras }] of hashGroups) {
|
||
const companies = new Set(paras.map((p) => p.filing.cik));
|
||
const filings = new Set(paras.map((p) => p.filing.accessionNumber));
|
||
const fiscalYears = new Set(paras.map((p) => p.filing.fiscalYear));
|
||
|
||
groups.push({
|
||
hash,
|
||
count: paras.length,
|
||
companies,
|
||
filings,
|
||
fiscalYears,
|
||
sampleText: paras[0]!.text.slice(0, 200),
|
||
sampleCompanies: [...new Set(paras.map((p) => p.filing.companyName))].slice(0, 5),
|
||
});
|
||
}
|
||
|
||
groups.sort((a, b) => b.count - a.count);
|
||
|
||
// ─── Statistics ───
|
||
|
||
const totalParagraphs = paragraphs.length;
|
||
const uniqueHashes = groups.length;
|
||
const duplicatedGroups = groups.filter((g) => g.count > 1);
|
||
const duplicatedParagraphs = duplicatedGroups.reduce((sum, g) => sum + g.count, 0);
|
||
const excessParagraphs = duplicatedGroups.reduce((sum, g) => sum + g.count - 1, 0);
|
||
|
||
// Cross-company duplicates (same text in different companies = boilerplate)
|
||
const crossCompanyGroups = groups.filter((g) => g.companies.size > 1);
|
||
const crossCompanyParagraphs = crossCompanyGroups.reduce((sum, g) => sum + g.count, 0);
|
||
|
||
// Cross-year duplicates (same company, same text, different fiscal years = copy-paste)
|
||
const crossYearGroups = groups.filter((g) => g.fiscalYears.size > 1);
|
||
const crossYearParagraphs = crossYearGroups.reduce((sum, g) => sum + g.count, 0);
|
||
|
||
// Same-company same-year (within-filing duplicates = parser artifact?)
|
||
const sameCompanySameYear = groups.filter(
|
||
(g) => g.count > 1 && g.companies.size === 1 && g.fiscalYears.size === 1,
|
||
);
|
||
|
||
// ─── Report ───
|
||
|
||
const lines: string[] = [];
|
||
const hr = "═".repeat(72);
|
||
const sr = "─".repeat(72);
|
||
|
||
lines.push(hr);
|
||
lines.push(" SEC-cyBERT Deduplication Analysis");
|
||
lines.push(` Generated: ${new Date().toISOString()}`);
|
||
lines.push(hr);
|
||
lines.push("");
|
||
|
||
lines.push("1. OVERVIEW");
|
||
lines.push(sr);
|
||
lines.push(` Total paragraphs: ${totalParagraphs.toLocaleString()}`);
|
||
lines.push(` Unique text hashes: ${uniqueHashes.toLocaleString()}`);
|
||
lines.push(` Duplicate groups: ${duplicatedGroups.length.toLocaleString()}`);
|
||
lines.push(` Paragraphs in dup groups: ${duplicatedParagraphs.toLocaleString()} (${round((duplicatedParagraphs / totalParagraphs) * 100)}%)`);
|
||
lines.push(` Excess (dedup savings): ${excessParagraphs.toLocaleString()} (${round((excessParagraphs / totalParagraphs) * 100)}%)`);
|
||
lines.push(` After dedup: ${(totalParagraphs - excessParagraphs).toLocaleString()} unique paragraphs`);
|
||
lines.push("");
|
||
|
||
lines.push("2. DUPLICATE TYPES");
|
||
lines.push(sr);
|
||
lines.push(` Cross-company (boilerplate templates): ${crossCompanyGroups.length} groups, ${crossCompanyParagraphs} paragraphs`);
|
||
lines.push(` Cross-year (copy-paste year-to-year): ${crossYearGroups.length} groups, ${crossYearParagraphs} paragraphs`);
|
||
lines.push(` Same-company same-year (parser dupes): ${sameCompanySameYear.length} groups`);
|
||
lines.push("");
|
||
|
||
// Distribution of duplicate group sizes
|
||
const sizeBuckets: Record<string, number> = {
|
||
"2 copies": 0,
|
||
"3-5 copies": 0,
|
||
"6-10 copies": 0,
|
||
"11-20 copies": 0,
|
||
"21-50 copies": 0,
|
||
"50+ copies": 0,
|
||
};
|
||
for (const g of duplicatedGroups) {
|
||
if (g.count === 2) sizeBuckets["2 copies"]!++;
|
||
else if (g.count <= 5) sizeBuckets["3-5 copies"]!++;
|
||
else if (g.count <= 10) sizeBuckets["6-10 copies"]!++;
|
||
else if (g.count <= 20) sizeBuckets["11-20 copies"]!++;
|
||
else if (g.count <= 50) sizeBuckets["21-50 copies"]!++;
|
||
else sizeBuckets["50+ copies"]!++;
|
||
}
|
||
|
||
lines.push("3. DUPLICATE GROUP SIZE DISTRIBUTION");
|
||
lines.push(sr);
|
||
for (const [bucket, count] of Object.entries(sizeBuckets)) {
|
||
if (count > 0) {
|
||
lines.push(` ${bucket.padEnd(20)} ${count} groups`);
|
||
}
|
||
}
|
||
lines.push("");
|
||
|
||
// Top cross-company boilerplate
|
||
lines.push("4. TOP CROSS-COMPANY BOILERPLATE (same text, different companies)");
|
||
lines.push(sr);
|
||
for (const g of crossCompanyGroups.slice(0, 15)) {
|
||
lines.push(` [${g.count} copies across ${g.companies.size} companies]`);
|
||
lines.push(` Companies: ${g.sampleCompanies.join(", ")}${g.companies.size > 5 ? ` (+${g.companies.size - 5} more)` : ""}`);
|
||
lines.push(` Text: "${g.sampleText}..."`);
|
||
lines.push("");
|
||
}
|
||
|
||
// Top cross-year persistence
|
||
lines.push("5. TOP CROSS-YEAR PERSISTENCE (same text, different fiscal years)");
|
||
lines.push(sr);
|
||
const crossYearSorted = [...crossYearGroups].sort((a, b) => b.count - a.count);
|
||
for (const g of crossYearSorted.slice(0, 10)) {
|
||
lines.push(` [${g.count} copies, years: ${[...g.fiscalYears].sort().join(", ")}, ${g.companies.size} companies]`);
|
||
lines.push(` Text: "${g.sampleText}..."`);
|
||
lines.push("");
|
||
}
|
||
|
||
// Labeling efficiency
|
||
lines.push("6. LABELING EFFICIENCY");
|
||
lines.push(sr);
|
||
const uniqueToLabel = totalParagraphs - excessParagraphs;
|
||
const labelSavings = excessParagraphs;
|
||
const costPerLabel = 0.001; // rough estimate per paragraph
|
||
lines.push(` Paragraphs to label (after dedup): ${uniqueToLabel.toLocaleString()}`);
|
||
lines.push(` LLM calls saved by dedup: ${labelSavings.toLocaleString()}`);
|
||
lines.push(` Estimated cost savings: $${round(labelSavings * costPerLabel * 3)}`);
|
||
lines.push(` (3 Stage-1 models × ~$0.001/paragraph)`);
|
||
lines.push("");
|
||
|
||
lines.push(hr);
|
||
lines.push(" END OF ANALYSIS");
|
||
lines.push(hr);
|
||
|
||
const report = lines.join("\n");
|
||
|
||
// Save
|
||
const reportPath = `${DATA}/analysis/dedup-analysis.txt`;
|
||
const jsonPath = `${DATA}/analysis/dedup-analysis.json`;
|
||
await mkdir(dirname(reportPath), { recursive: true });
|
||
|
||
const jsonData = {
|
||
generatedAt: new Date().toISOString(),
|
||
totalParagraphs,
|
||
uniqueHashes,
|
||
duplicatedGroups: duplicatedGroups.length,
|
||
excessParagraphs,
|
||
afterDedup: totalParagraphs - excessParagraphs,
|
||
crossCompany: { groups: crossCompanyGroups.length, paragraphs: crossCompanyParagraphs },
|
||
crossYear: { groups: crossYearGroups.length, paragraphs: crossYearParagraphs },
|
||
sameCompanySameYear: sameCompanySameYear.length,
|
||
topBoilerplate: crossCompanyGroups.slice(0, 30).map((g) => ({
|
||
hash: g.hash,
|
||
count: g.count,
|
||
companies: g.companies.size,
|
||
fiscalYears: [...g.fiscalYears].sort(),
|
||
sampleText: g.sampleText,
|
||
sampleCompanies: g.sampleCompanies,
|
||
})),
|
||
topDuplicates: groups.slice(0, 50).map((g) => ({
|
||
hash: g.hash,
|
||
count: g.count,
|
||
companies: g.companies.size,
|
||
filings: g.filings.size,
|
||
fiscalYears: [...g.fiscalYears].sort(),
|
||
sampleText: g.sampleText,
|
||
})),
|
||
};
|
||
|
||
await writeFile(jsonPath, JSON.stringify(jsonData, null, 2));
|
||
await writeFile(reportPath, report);
|
||
|
||
process.stderr.write(` Saved: ${reportPath}\n`);
|
||
process.stderr.write(` Saved: ${jsonPath}\n`);
|
||
console.log(report);
|
||
}
|
||
|
||
main().catch((err) => {
|
||
console.error(err);
|
||
process.exit(1);
|
||
});
|