/**
 * DAPT corpus analytics: document length distribution, token estimates,
 * quality checks, and filter candidates.
 *
 * Usage: bun ts/scripts/dapt-corpus-analytics.ts
 *
 * Input: data/dapt-corpus/shard-*.jsonl
 *
 * Each shard is read from disk exactly once; every per-document and per-shard
 * figure reported below is collected during that single pass. The human-readable
 * report goes to stdout, progress and errors go to stderr.
 */
import { readFileSync, readdirSync } from "node:fs";
import { join } from "node:path";
import { fileURLToPath } from "node:url";

// fileURLToPath (rather than URL.pathname) decodes percent-escapes, so the
// script also works when the checkout path contains spaces or non-ASCII chars.
const CORPUS_DIR = fileURLToPath(new URL("../../data/dapt-corpus", import.meta.url));
const CHARS_PER_TOKEN = 4.72; // empirical from ModernBERT tokenizer

// Character-count thresholds used for the filter-candidate buckets.
const EMPTY_MAX_CHARS = 100;
const TINY_MAX_CHARS = 10_000;
const SMALL_MAX_CHARS = 50_000;
const HUGE_MIN_CHARS = 5_000_000;

// Content-quality probes, hoisted so they are compiled once.
const HTML_TAG_RE = /<[a-z][^>]*>/i;
const XBRL_RE = /ix:|xbrl|xmlns/i;
const PAGE_NUMBER_RE = /\n\s*(?:\d{1,3}|[- ]\d{1,3}[- ]|F-\d+)\s*\n/;
const URL_RE = /https?:\/\//;

/** One corpus record as stored in a shard line. */
interface Doc {
  accession: string;
  text: string;
}

/** Size figures kept per document (the text itself is not retained). */
interface DocStats {
  accession: string;
  chars: number;
  words: number;
  /** Estimated token count: chars / CHARS_PER_TOKEN, rounded. */
  tokens: number;
}

/** Per-shard totals for the shard distribution table. */
interface ShardStats {
  name: string;
  docCount: number;
  sizeBytes: number;
}

const PERCENTILE_BY_LABEL = {
  Min: 0,
  P5: 5,
  P10: 10,
  P25: 25,
  Median: 50,
  P75: 75,
  P90: 90,
  P95: 95,
  Max: 100,
} as const;

type RowLabel = keyof typeof PERCENTILE_BY_LABEL | "Mean";

const FULL_ROWS: readonly RowLabel[] = ["Min", "P5", "P10", "P25", "Median", "Mean", "P75", "P90", "P95", "Max"];
const SHORT_ROWS: readonly RowLabel[] = ["Min", "P5", "Median", "Mean", "P95", "Max"];

/**
 * Nearest-rank percentile of an ascending-sorted array.
 *
 * @param arr values sorted in ascending order
 * @param p percentile in [0, 100]; 0 yields the first element, 100 the last
 * @throws RangeError when `arr` is empty
 */
function percentile(arr: readonly number[], p: number): number {
  const idx = Math.max(0, Math.ceil((p / 100) * arr.length) - 1);
  const value = arr[idx];
  if (value === undefined) {
    throw new RangeError("percentile() requires a non-empty array");
  }
  return value;
}

/** Arithmetic mean; NaN for an empty array. */
function mean(arr: readonly number[]): number {
  return arr.reduce((a, b) => a + b, 0) / arr.length;
}

/** Formats `part / whole` as a percentage string; a zero `whole` yields 0 instead of NaN. */
function percent(part: number, whole: number, digits: number): string {
  const ratio = whole === 0 ? 0 : part / whole;
  return (ratio * 100).toFixed(digits);
}

/** Sum of character counts over a set of documents. */
function charSum(items: readonly DocStats[]): number {
  return items.reduce((sum, d) => sum + d.chars, 0);
}

/** Number of training sequences needed when every document is chunked to `seqLen` tokens. */
function sequenceCount(items: readonly DocStats[], seqLen: number): number {
  return items.reduce((sum, d) => sum + Math.ceil(d.tokens / seqLen), 0);
}

/**
 * Parses one shard line into a Doc.
 *
 * @throws Error naming the shard and 1-based line number when the line is not
 *   valid JSON or has no string `text` field
 */
function parseDoc(line: string, shard: string, lineNo: number): Doc {
  let parsed: unknown;
  try {
    parsed = JSON.parse(line);
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    throw new Error(`${shard}:${lineNo}: invalid JSON (${reason})`);
  }
  if (typeof parsed !== "object" || parsed === null) {
    throw new Error(`${shard}:${lineNo}: expected a JSON object`);
  }
  const { accession, text } = parsed as Record<string, unknown>;
  if (typeof text !== "string") {
    throw new Error(`${shard}:${lineNo}: missing or non-string "text" field`);
  }
  // A non-string accession is only ever printed, so stringify instead of rejecting.
  return { accession: typeof accession === "string" ? accession : String(accession), text };
}

/** Prints one titled block of percentile/mean rows for an ascending-sorted sample. */
function printDistribution(title: string, sorted: readonly number[], labels: readonly RowLabel[]): void {
  console.log(`\n─── ${title} ───`);
  for (const label of labels) {
    const value = label === "Mean" ? Math.round(mean(sorted)) : percentile(sorted, PERCENTILE_BY_LABEL[label]);
    console.log(` ${label}: ${value.toLocaleString()}`);
  }
}

// ── Load all documents (single pass over every shard) ───────────────────

console.error("Loading corpus...");

const shards = readdirSync(CORPUS_DIR)
  .filter((f) => f.endsWith(".jsonl"))
  .sort();

const docs: DocStats[] = [];
const shardStats: ShardStats[] = [];
const quality = { html: 0, xbrl: 0, pageNums: 0, urls: 0, singleBlock: 0 };
let totalChars = 0;

for (const shard of shards) {
  // Read raw bytes once: the buffer length is the on-disk size, the decoded
  // string supplies the records.
  const raw = readFileSync(join(CORPUS_DIR, shard));
  const shardLines = raw.toString("utf-8").split("\n");
  let docCount = 0;

  for (const [i, line] of shardLines.entries()) {
    if (!line.trim()) continue;
    const doc = parseDoc(line, shard, i + 1);
    const { text } = doc;

    const chars = text.length;
    const words = text.split(/\s+/).filter(Boolean).length;
    docs.push({ accession: doc.accession, chars, words, tokens: Math.round(chars / CHARS_PER_TOKEN) });
    totalChars += chars;
    docCount++;

    if (HTML_TAG_RE.test(text)) quality.html++;
    if (XBRL_RE.test(text)) quality.xbrl++;
    if (PAGE_NUMBER_RE.test(text)) quality.pageNums++;
    if (URL_RE.test(text)) quality.urls++;
    // Fewer than three blank-line-separated blocks.
    if (text.split("\n\n").length < 3) quality.singleBlock++;
  }

  shardStats.push({ name: shard, docCount, sizeBytes: raw.length });
}

console.error(` ${docs.length} documents loaded from ${shards.length} shards\n`);

// Every statistic below needs at least one document; fail with a clear message
// instead of a TypeError from an empty percentile lookup.
if (docs.length === 0) {
  console.error(`No documents found in ${CORPUS_DIR}; nothing to analyze.`);
  process.exit(1);
}

// ── Basic stats ─────────────────────────────────────────────────────────

const byAscending = (a: number, b: number): number => a - b;
const charsSorted = docs.map((d) => d.chars).sort(byAscending);
const wordsSorted = docs.map((d) => d.words).sort(byAscending);
const tokensSorted = docs.map((d) => d.tokens).sort(byAscending);

const totalTokens = Math.round(totalChars / CHARS_PER_TOKEN);

console.log("═══ DAPT CORPUS ANALYTICS ═══\n");
console.log("─── Overview ───");
console.log(` Documents: ${docs.length.toLocaleString()}`);
console.log(` Shards: ${shards.length}`);
console.log(` Total chars: ${(totalChars / 1e9).toFixed(3)}B`);
console.log(` Total tokens (est): ${(totalTokens / 1e6).toFixed(1)}M (@ ${CHARS_PER_TOKEN} chars/token)`);

printDistribution("Document Length Distribution (chars)", charsSorted, FULL_ROWS);
printDistribution("Document Length Distribution (words)", wordsSorted, SHORT_ROWS);

// ── Token length distribution ───────────────────────────────────────────

printDistribution("Token Length Distribution (estimated)", tokensSorted, FULL_ROWS);

// ── Sequence count at different max_seq_length ──────────────────────────

console.log("\n─── Training Sequences by max_seq_length ───");
for (const seqLen of [512, 1024, 2048, 4096, 8192]) {
  const totalSeqs = sequenceCount(docs, seqLen);
  const docsExceeding = docs.filter((d) => d.tokens > seqLen).length;
  console.log(
    ` ${String(seqLen).padStart(5)}: ${totalSeqs.toLocaleString().padStart(10)} sequences` +
      ` (${docsExceeding.toLocaleString()} docs exceed, ${percent(docsExceeding, docs.length, 1)}%)`,
  );
}

// ── Filter candidates ───────────────────────────────────────────────────

const tiny = docs.filter((d) => d.chars < TINY_MAX_CHARS);
const small = docs.filter((d) => d.chars < SMALL_MAX_CHARS);
const empty = docs.filter((d) => d.chars < EMPTY_MAX_CHARS);
const huge = docs.filter((d) => d.chars > HUGE_MIN_CHARS);

console.log("\n─── Filter Candidates ───");
console.log(` <100 chars (empty): ${empty.length}`);
console.log(` <10K chars (covers): ${tiny.length} (${percent(charSum(tiny), totalChars, 3)}% of corpus)`);
console.log(` <50K chars (small): ${small.length} (${percent(charSum(small), totalChars, 3)}% of corpus)`);
console.log(` >5M chars (huge): ${huge.length}`);

// Only list the tiny documents individually when the list is short enough to read.
if (tiny.length > 0 && tiny.length <= 20) {
  console.log("\n Tiny documents (<10K chars):");
  for (const d of [...tiny].sort((a, b) => a.chars - b.chars)) {
    console.log(` ${d.accession}: ${d.chars.toLocaleString()} chars, ${d.words.toLocaleString()} words`);
  }
}

// ── Content quality spot checks ─────────────────────────────────────────

console.log("\n─── Content Quality Checks ───");
console.log(` Residual HTML tags: ${quality.html} docs (${percent(quality.html, docs.length, 1)}%)`);
console.log(` XBRL/xmlns traces: ${quality.xbrl} docs (${percent(quality.xbrl, docs.length, 1)}%)`);
console.log(` Page number traces: ${quality.pageNums} docs (${percent(quality.pageNums, docs.length, 1)}%)`);
console.log(` URLs present: ${quality.urls} docs (${percent(quality.urls, docs.length, 1)}%)`);
console.log(` Single-block (<3¶): ${quality.singleBlock} docs`);

// ── Shard distribution ──────────────────────────────────────────────────

console.log("\n─── Shard Distribution ───");
for (const { name, docCount, sizeBytes } of shardStats) {
  console.log(
    ` ${name}: ${docCount.toLocaleString().padStart(6)} docs, ${(sizeBytes / 1e6).toFixed(0).padStart(4)} MB`,
  );
}

// ── Post-filter stats ───────────────────────────────────────────────────

const filtered = docs.filter((d) => d.chars >= TINY_MAX_CHARS);
const filteredChars = charSum(filtered);
const filteredTokens = Math.round(filteredChars / CHARS_PER_TOKEN);
// Guard the ratio so a corpus with zero estimated tokens reports 0% rather than NaN%.
const tokenLoss = totalTokens === 0 ? 0 : 1 - filteredTokens / totalTokens;

console.log("\n─── After Filtering <10K chars ───");
console.log(` Documents: ${filtered.length.toLocaleString()} (removed ${docs.length - filtered.length})`);
console.log(` Total chars: ${(filteredChars / 1e9).toFixed(3)}B`);
console.log(` Total tokens (est): ${(filteredTokens / 1e6).toFixed(1)}M`);
console.log(` Token loss: ${(tokenLoss * 100).toFixed(3)}%`);

// ── Training time estimates ─────────────────────────────────────────────

const TRAINING_CONFIGS = [
  { seqLen: 2048, batchSize: 4, gradAccum: 8, secPerStepRange: [1.0, 1.5, 2.0] },
  { seqLen: 8192, batchSize: 1, gradAccum: 32, secPerStepRange: [3.0, 5.0, 7.0] },
];

console.log("\n─── Training Time Estimates (RTX 3090, bf16, grad_checkpoint) ───");
for (const { seqLen, batchSize, gradAccum, secPerStepRange } of TRAINING_CONFIGS) {
  const totalSeqs = sequenceCount(filtered, seqLen);
  const effectiveBatch = batchSize * gradAccum;
  const stepsPerEpoch = Math.ceil(totalSeqs / effectiveBatch);
  console.log(`\n seq_len=${seqLen}, batch=${batchSize}, grad_accum=${gradAccum} (eff=${effectiveBatch})`);
  console.log(` Sequences: ${totalSeqs.toLocaleString()}, Steps/epoch: ${stepsPerEpoch.toLocaleString()}`);
  for (const secPerStep of secPerStepRange) {
    const hoursPerEpoch = (stepsPerEpoch * secPerStep) / 3600;
    console.log(` @ ${secPerStep}s/step: ${hoursPerEpoch.toFixed(1)}h/epoch`);
  }
}