SEC-cyBERT/ts/scripts/dapt-corpus-analytics.ts
2026-03-29 20:33:39 -04:00

204 lines
10 KiB
TypeScript

/**
* DAPT corpus analytics: document length distribution, token estimates,
* quality checks, and filter candidates.
*
* Usage: bun ts/scripts/dapt-corpus-analytics.ts
*
* Input: data/dapt-corpus/shard-*.jsonl
*/
import { readFileSync, readdirSync } from "node:fs";
import { fileURLToPath } from "node:url";
// Corpus directory resolved relative to this script. fileURLToPath correctly
// decodes percent-encoded characters (e.g. spaces) and handles Windows drive
// letters, both of which URL.pathname gets wrong.
const CORPUS_DIR = fileURLToPath(new URL("../../data/dapt-corpus", import.meta.url));
// Average characters per token, measured empirically with the ModernBERT
// tokenizer; used everywhere below to turn char counts into token estimates.
const CHARS_PER_TOKEN = 4.72;
/**
 * One corpus document as serialized on each line of a shard JSONL file.
 * Only these two fields are read by this script.
 */
interface Doc {
// Filing identifier — presumably an SEC EDGAR accession number; confirm with the corpus builder
accession: string;
// Full extracted plain text of the document
text: string;
}
// ── Load all documents ──────────────────────────────────────────────────
// Stream every shard, keeping only lightweight per-document stats (not the
// text itself) so the whole corpus never has to sit in memory at once.
console.error("Loading corpus...");
const shards = readdirSync(CORPUS_DIR)
  .filter((name) => name.endsWith(".jsonl"))
  .sort();
const docs: { accession: string; chars: number; lines: number; words: number }[] = [];
let totalChars = 0;
for (const shard of shards) {
  const raw = readFileSync(`${CORPUS_DIR}/${shard}`, "utf-8");
  for (const line of raw.split("\n")) {
    if (line.trim() === "") continue; // skip blank lines between records
    const { accession, text } = JSON.parse(line) as Doc;
    const record = {
      accession,
      chars: text.length,
      lines: text.split("\n").length,
      words: text.split(/\s+/).filter(Boolean).length,
    };
    docs.push(record);
    totalChars += record.chars;
  }
}
console.error(` ${docs.length} documents loaded from ${shards.length} shards\n`);
// ── Basic stats ─────────────────────────────────────────────────────────
// Ascending copies of the per-document counts, for nearest-rank percentiles.
const ascending = (a: number, b: number) => a - b;
const charsSorted = docs.map((d) => d.chars).sort(ascending);
const wordsSorted = docs.map((d) => d.words).sort(ascending);
/**
 * Nearest-rank percentile of an ascending-sorted numeric array.
 * p=0 yields the minimum, p=100 the maximum.
 *
 * @param arr - values sorted in ascending order
 * @param p - percentile in [0, 100]
 * @returns the nearest-rank value, or NaN for an empty array (the original
 *   non-null assertion would have returned `undefined` typed as `number`)
 */
function percentile(arr: number[], p: number): number {
  if (arr.length === 0) return NaN; // explicit guard instead of a lying `!`
  const idx = Math.ceil((p / 100) * arr.length) - 1;
  return arr[Math.max(0, idx)]!; // Math.max clamps the p=0 case (idx = -1)
}
/**
 * Arithmetic mean of the array.
 * An empty array yields NaN (0 / 0), same as the reduce-based original.
 */
function mean(arr: number[]): number {
  let sum = 0;
  for (const value of arr) {
    sum += value;
  }
  return sum / arr.length;
}
const totalTokens = Math.round(totalChars / CHARS_PER_TOKEN);

console.log("═══ DAPT CORPUS ANALYTICS ═══\n");
console.log("─── Overview ───");
console.log(` Documents: ${docs.length.toLocaleString()}`);
console.log(` Shards: ${shards.length}`);
console.log(` Total chars: ${(totalChars / 1e9).toFixed(3)}B`);
console.log(` Total tokens (est): ${(totalTokens / 1e6).toFixed(1)}M (@ ${CHARS_PER_TOKEN} chars/token)`);

// Label/value rows printed in order; Mean is interleaved between Median and
// P75 to preserve the original output layout exactly.
const charRows: [string, number][] = [
  ["Min", percentile(charsSorted, 0)],
  ["P5", percentile(charsSorted, 5)],
  ["P10", percentile(charsSorted, 10)],
  ["P25", percentile(charsSorted, 25)],
  ["Median", percentile(charsSorted, 50)],
  ["Mean", Math.round(mean(charsSorted))],
  ["P75", percentile(charsSorted, 75)],
  ["P90", percentile(charsSorted, 90)],
  ["P95", percentile(charsSorted, 95)],
  ["Max", percentile(charsSorted, 100)],
];
console.log("\n─── Document Length Distribution (chars) ───");
for (const [label, value] of charRows) {
  console.log(` ${label}: ${value.toLocaleString()}`);
}

const wordRows: [string, number][] = [
  ["Min", percentile(wordsSorted, 0)],
  ["P5", percentile(wordsSorted, 5)],
  ["Median", percentile(wordsSorted, 50)],
  ["Mean", Math.round(mean(wordsSorted))],
  ["P95", percentile(wordsSorted, 95)],
  ["Max", percentile(wordsSorted, 100)],
];
console.log("\n─── Document Length Distribution (words) ───");
for (const [label, value] of wordRows) {
  console.log(` ${label}: ${value.toLocaleString()}`);
}
// ── Token length distribution ───────────────────────────────────────────
// Per-document token estimates (chars / CHARS_PER_TOKEN), sorted ascending.
const tokensSorted = docs
  .map((d) => Math.round(d.chars / CHARS_PER_TOKEN))
  .sort((a, b) => a - b);

const tokenRows: [string, number][] = [
  ["Min", percentile(tokensSorted, 0)],
  ["P5", percentile(tokensSorted, 5)],
  ["P10", percentile(tokensSorted, 10)],
  ["P25", percentile(tokensSorted, 25)],
  ["Median", percentile(tokensSorted, 50)],
  ["Mean", Math.round(mean(tokensSorted))],
  ["P75", percentile(tokensSorted, 75)],
  ["P90", percentile(tokensSorted, 90)],
  ["P95", percentile(tokensSorted, 95)],
  ["Max", percentile(tokensSorted, 100)],
];
console.log("\n─── Token Length Distribution (estimated) ───");
for (const [label, value] of tokenRows) {
  console.log(` ${label}: ${value.toLocaleString()}`);
}
// ── Sequence count at different max_seq_length ──────────────────────────
console.log("\n─── Training Sequences by max_seq_length ───");
for (const seqLen of [512, 1024, 2048, 4096, 8192]) {
let totalSeqs = 0;
for (const d of docs) {
const tokens = Math.round(d.chars / CHARS_PER_TOKEN);
totalSeqs += Math.ceil(tokens / seqLen);
}
const docsExceeding = docs.filter((d) => Math.round(d.chars / CHARS_PER_TOKEN) > seqLen).length;
console.log(
` ${String(seqLen).padStart(5)}: ${totalSeqs.toLocaleString().padStart(10)} sequences` +
` (${docsExceeding.toLocaleString()} docs exceed, ${((docsExceeding / docs.length) * 100).toFixed(1)}%)`,
);
}
// ── Filter candidates ───────────────────────────────────────────────────
// Size buckets considered for dropping documents before DAPT training.
const sumChars = (subset: { chars: number }[]) =>
  subset.reduce((acc, d) => acc + d.chars, 0);
const corpusShare = (subset: { chars: number }[]) =>
  ((sumChars(subset) / totalChars) * 100).toFixed(3);
const empty = docs.filter((d) => d.chars < 100);
const tiny = docs.filter((d) => d.chars < 10_000);
const small = docs.filter((d) => d.chars < 50_000);
const huge = docs.filter((d) => d.chars > 5_000_000);
console.log("\n─── Filter Candidates ───");
console.log(` <100 chars (empty): ${empty.length}`);
console.log(` <10K chars (covers): ${tiny.length} (${corpusShare(tiny)}% of corpus)`);
console.log(` <50K chars (small): ${small.length} (${corpusShare(small)}% of corpus)`);
console.log(` >5M chars (huge): ${huge.length}`);
// Only enumerate the tiny bucket when it is small enough to eyeball.
if (tiny.length > 0 && tiny.length <= 20) {
  console.log("\n Tiny documents (<10K chars):");
  for (const d of [...tiny].sort((a, b) => a.chars - b.chars)) {
    console.log(` ${d.accession}: ${d.chars.toLocaleString()} chars, ${d.words.toLocaleString()} words`);
  }
}
// ── Content quality spot checks ─────────────────────────────────────────
// Re-scan the raw shards for residue the extraction pipeline should have
// removed (document text was not kept in memory by the loader above).
console.log("\n─── Content Quality Checks ───");
const HTML_TAG = /<[a-z][^>]*>/i;
const XBRL_TRACE = /ix:|xbrl|xmlns/i;
const PAGE_NUMBER = /\n\s*(?:\d{1,3}|[- ]\d{1,3}[- ]|F-\d+)\s*\n/;
const URL_PATTERN = /https?:\/\//;
let docsWithHtml = 0;
let docsWithXbrl = 0;
let docsWithPageNums = 0;
let docsWithUrls = 0;
let singleBlockDocs = 0;
for (const shard of shards) {
  const raw = readFileSync(`${CORPUS_DIR}/${shard}`, "utf-8");
  for (const line of raw.split("\n")) {
    if (line.trim() === "") continue;
    const { text } = JSON.parse(line) as Doc;
    if (HTML_TAG.test(text)) docsWithHtml++;
    if (XBRL_TRACE.test(text)) docsWithXbrl++;
    if (PAGE_NUMBER.test(text)) docsWithPageNums++;
    if (URL_PATTERN.test(text)) docsWithUrls++;
    // Fewer than 3 paragraph breaks suggests the text collapsed to one block.
    if (text.split("\n\n").length < 3) singleBlockDocs++;
  }
}
const pct = (count: number) => ((count / docs.length) * 100).toFixed(1);
console.log(` Residual HTML tags: ${docsWithHtml} docs (${pct(docsWithHtml)}%)`);
console.log(` XBRL/xmlns traces: ${docsWithXbrl} docs (${pct(docsWithXbrl)}%)`);
console.log(` Page number traces: ${docsWithPageNums} docs (${pct(docsWithPageNums)}%)`);
console.log(` URLs present: ${docsWithUrls} docs (${pct(docsWithUrls)}%)`);
console.log(` Single-block (<3¶): ${singleBlockDocs} docs`);
// ── Shard distribution ──────────────────────────────────────────────────
// Per-shard document counts and on-disk sizes. Each shard is read once into a
// buffer (the original read every file twice: once for lines, once for size)
// and the unused `shardIdx` counter is removed.
console.log("\n─── Shard Distribution ───");
for (const shard of shards) {
  const buf = readFileSync(`${CORPUS_DIR}/${shard}`);
  // Non-blank lines correspond to documents, matching the loader's convention.
  const docCount = buf.toString("utf-8").split("\n").filter((l) => l.trim()).length;
  console.log(
    ` ${shard}: ${docCount.toLocaleString().padStart(6)} docs, ${(buf.length / 1e6).toFixed(0).padStart(4)} MB`,
  );
}
// ── Post-filter stats ───────────────────────────────────────────────────
// Corpus size after dropping the <10K-char documents ("tiny" bucket above).
const filtered = docs.filter((d) => d.chars >= 10_000);
let filteredChars = 0;
for (const d of filtered) {
  filteredChars += d.chars;
}
const filteredTokens = Math.round(filteredChars / CHARS_PER_TOKEN);
console.log("\n─── After Filtering <10K chars ───");
console.log(` Documents: ${filtered.length.toLocaleString()} (removed ${docs.length - filtered.length})`);
console.log(` Total chars: ${(filteredChars / 1e9).toFixed(3)}B`);
console.log(` Total tokens (est): ${(filteredTokens / 1e6).toFixed(1)}M`);
console.log(` Token loss: ${((1 - filteredTokens / totalTokens) * 100).toFixed(3)}%`);
// ── Training time estimates ─────────────────────────────────────────────
// Rough wall-clock projections for the filtered corpus on a single RTX 3090,
// across two context-length configurations and a range of per-step times.
console.log("\n─── Training Time Estimates (RTX 3090, bf16, grad_checkpoint) ───");
const configs = [
  { seqLen: 2048, batchSize: 4, gradAccum: 8, secPerStepRange: [1.0, 1.5, 2.0] },
  { seqLen: 8192, batchSize: 1, gradAccum: 32, secPerStepRange: [3.0, 5.0, 7.0] },
];
for (const { seqLen, batchSize, gradAccum, secPerStepRange } of configs) {
  let totalSeqs = 0;
  for (const d of filtered) {
    totalSeqs += Math.ceil(Math.round(d.chars / CHARS_PER_TOKEN) / seqLen);
  }
  const effectiveBatch = batchSize * gradAccum;
  const stepsPerEpoch = Math.ceil(totalSeqs / effectiveBatch);
  console.log(`\n seq_len=${seqLen}, batch=${batchSize}, grad_accum=${gradAccum} (eff=${effectiveBatch})`);
  console.log(` Sequences: ${totalSeqs.toLocaleString()}, Steps/epoch: ${stepsPerEpoch.toLocaleString()}`);
  for (const secPerStep of secPerStepRange) {
    const hoursPerEpoch = (stepsPerEpoch * secPerStep) / 3600;
    console.log(` @ ${secPerStep}s/step: ${hoursPerEpoch.toFixed(1)}h/epoch`);
  }
}