SEC-cyBERT/ts/scripts/dapt-corpus-analytics.ts
2026-03-29 20:33:39 -04:00

204 lines
10 KiB
TypeScript

/**
* DAPT corpus analytics: document length distribution, token estimates,
* quality checks, and filter candidates.
*
* Usage: bun ts/scripts/dapt-corpus-analytics.ts
*
* Input: data/dapt-corpus/shard-*.jsonl
*/
import { readFileSync, readdirSync } from "node:fs";
import { fileURLToPath } from "node:url";
// Corpus directory resolved relative to this script. fileURLToPath correctly
// decodes percent-encoded characters (e.g. spaces) and handles Windows drive
// letters, both of which URL.pathname gets wrong.
const CORPUS_DIR = fileURLToPath(new URL("../../data/dapt-corpus", import.meta.url));
// Average characters per token, measured empirically with the ModernBERT
// tokenizer; used everywhere below to turn char counts into token estimates.
const CHARS_PER_TOKEN = 4.72;
/**
 * One corpus document as serialized on each line of a shard JSONL file.
 * Only these two fields are read by this script.
 */
interface Doc {
// Filing identifier — presumably an SEC EDGAR accession number; confirm with the corpus builder
accession: string;
// Full extracted plain text of the document
text: string;
}
// ── Load all documents ──────────────────────────────────────────────────
// Stream every shard, keeping only lightweight per-document stats (not the
// text itself) so the whole corpus never has to sit in memory at once.
console.error("Loading corpus...");
const shards = readdirSync(CORPUS_DIR)
  .filter((name) => name.endsWith(".jsonl"))
  .sort();
const docs: { accession: string; chars: number; lines: number; words: number }[] = [];
let totalChars = 0;
for (const shard of shards) {
  const raw = readFileSync(`${CORPUS_DIR}/${shard}`, "utf-8");
  for (const line of raw.split("\n")) {
    if (line.trim() === "") continue; // skip blank lines between records
    const { accession, text } = JSON.parse(line) as Doc;
    const record = {
      accession,
      chars: text.length,
      lines: text.split("\n").length,
      words: text.split(/\s+/).filter(Boolean).length,
    };
    docs.push(record);
    totalChars += record.chars;
  }
}
console.error(` ${docs.length} documents loaded from ${shards.length} shards\n`);
// ── Basic stats ─────────────────────────────────────────────────────────
// Ascending copies of the per-document counts, for nearest-rank percentiles.
const ascending = (a: number, b: number) => a - b;
const charsSorted = docs.map((d) => d.chars).sort(ascending);
const wordsSorted = docs.map((d) => d.words).sort(ascending);
/**
 * Nearest-rank percentile of an ascending-sorted numeric array.
 * p=0 yields the minimum, p=100 the maximum.
 *
 * @param arr - values sorted in ascending order
 * @param p - percentile in [0, 100]
 * @returns the nearest-rank value, or NaN for an empty array (the original
 *   non-null assertion would have returned `undefined` typed as `number`)
 */
function percentile(arr: number[], p: number): number {
  if (arr.length === 0) return NaN; // explicit guard instead of a lying `!`
  const idx = Math.ceil((p / 100) * arr.length) - 1;
  return arr[Math.max(0, idx)]!; // Math.max clamps the p=0 case (idx = -1)
}
/**
 * Arithmetic mean of the array.
 * An empty array yields NaN (0 / 0), same as the reduce-based original.
 */
function mean(arr: number[]): number {
  let sum = 0;
  for (const value of arr) {
    sum += value;
  }
  return sum / arr.length;
}
const totalTokens = Math.round(totalChars / CHARS_PER_TOKEN);

console.log("═══ DAPT CORPUS ANALYTICS ═══\n");
console.log("─── Overview ───");
console.log(` Documents: ${docs.length.toLocaleString()}`);
console.log(` Shards: ${shards.length}`);
console.log(` Total chars: ${(totalChars / 1e9).toFixed(3)}B`);
console.log(` Total tokens (est): ${(totalTokens / 1e6).toFixed(1)}M (@ ${CHARS_PER_TOKEN} chars/token)`);

// Label/value rows printed in order; Mean is interleaved between Median and
// P75 to preserve the original output layout exactly.
const charRows: [string, number][] = [
  ["Min", percentile(charsSorted, 0)],
  ["P5", percentile(charsSorted, 5)],
  ["P10", percentile(charsSorted, 10)],
  ["P25", percentile(charsSorted, 25)],
  ["Median", percentile(charsSorted, 50)],
  ["Mean", Math.round(mean(charsSorted))],
  ["P75", percentile(charsSorted, 75)],
  ["P90", percentile(charsSorted, 90)],
  ["P95", percentile(charsSorted, 95)],
  ["Max", percentile(charsSorted, 100)],
];
console.log("\n─── Document Length Distribution (chars) ───");
for (const [label, value] of charRows) {
  console.log(` ${label}: ${value.toLocaleString()}`);
}

const wordRows: [string, number][] = [
  ["Min", percentile(wordsSorted, 0)],
  ["P5", percentile(wordsSorted, 5)],
  ["Median", percentile(wordsSorted, 50)],
  ["Mean", Math.round(mean(wordsSorted))],
  ["P95", percentile(wordsSorted, 95)],
  ["Max", percentile(wordsSorted, 100)],
];
console.log("\n─── Document Length Distribution (words) ───");
for (const [label, value] of wordRows) {
  console.log(` ${label}: ${value.toLocaleString()}`);
}
// ── Token length distribution ───────────────────────────────────────────
// Per-document token estimates (chars / CHARS_PER_TOKEN), sorted ascending.
const tokensSorted = docs
  .map((d) => Math.round(d.chars / CHARS_PER_TOKEN))
  .sort((a, b) => a - b);

const tokenRows: [string, number][] = [
  ["Min", percentile(tokensSorted, 0)],
  ["P5", percentile(tokensSorted, 5)],
  ["P10", percentile(tokensSorted, 10)],
  ["P25", percentile(tokensSorted, 25)],
  ["Median", percentile(tokensSorted, 50)],
  ["Mean", Math.round(mean(tokensSorted))],
  ["P75", percentile(tokensSorted, 75)],
  ["P90", percentile(tokensSorted, 90)],
  ["P95", percentile(tokensSorted, 95)],
  ["Max", percentile(tokensSorted, 100)],
];
console.log("\n─── Token Length Distribution (estimated) ───");
for (const [label, value] of tokenRows) {
  console.log(` ${label}: ${value.toLocaleString()}`);
}
// ── Sequence count at different max_seq_length ──────────────────────────
console.log("\n─── Training Sequences by max_seq_length ───");
for (const seqLen of [512, 1024, 2048, 4096, 8192]) {
let totalSeqs = 0;
for (const d of docs) {
const tokens = Math.round(d.chars / CHARS_PER_TOKEN);
totalSeqs += Math.ceil(tokens / seqLen);
}
const docsExceeding = docs.filter((d) => Math.round(d.chars / CHARS_PER_TOKEN) > seqLen).length;
console.log(
` ${String(seqLen).padStart(5)}: ${totalSeqs.toLocaleString().padStart(10)} sequences` +
` (${docsExceeding.toLocaleString()} docs exceed, ${((docsExceeding / docs.length) * 100).toFixed(1)}%)`,
);
}
// ── Filter candidates ───────────────────────────────────────────────────
// Size buckets considered for dropping documents before DAPT training.
const sumChars = (subset: { chars: number }[]) =>
  subset.reduce((acc, d) => acc + d.chars, 0);
const corpusShare = (subset: { chars: number }[]) =>
  ((sumChars(subset) / totalChars) * 100).toFixed(3);
const empty = docs.filter((d) => d.chars < 100);
const tiny = docs.filter((d) => d.chars < 10_000);
const small = docs.filter((d) => d.chars < 50_000);
const huge = docs.filter((d) => d.chars > 5_000_000);
console.log("\n─── Filter Candidates ───");
console.log(` <100 chars (empty): ${empty.length}`);
console.log(` <10K chars (covers): ${tiny.length} (${corpusShare(tiny)}% of corpus)`);
console.log(` <50K chars (small): ${small.length} (${corpusShare(small)}% of corpus)`);
console.log(` >5M chars (huge): ${huge.length}`);
// Only enumerate the tiny bucket when it is small enough to eyeball.
if (tiny.length > 0 && tiny.length <= 20) {
  console.log("\n Tiny documents (<10K chars):");
  for (const d of [...tiny].sort((a, b) => a.chars - b.chars)) {
    console.log(` ${d.accession}: ${d.chars.toLocaleString()} chars, ${d.words.toLocaleString()} words`);
  }
}
// ── Content quality spot checks ─────────────────────────────────────────
// Re-scan the raw shards for residue the extraction pipeline should have
// removed (document text was not kept in memory by the loader above).
console.log("\n─── Content Quality Checks ───");
const HTML_TAG = /<[a-z][^>]*>/i;
const XBRL_TRACE = /ix:|xbrl|xmlns/i;
const PAGE_NUMBER = /\n\s*(?:\d{1,3}|[- ]\d{1,3}[- ]|F-\d+)\s*\n/;
const URL_PATTERN = /https?:\/\//;
let docsWithHtml = 0;
let docsWithXbrl = 0;
let docsWithPageNums = 0;
let docsWithUrls = 0;
let singleBlockDocs = 0;
for (const shard of shards) {
  const raw = readFileSync(`${CORPUS_DIR}/${shard}`, "utf-8");
  for (const line of raw.split("\n")) {
    if (line.trim() === "") continue;
    const { text } = JSON.parse(line) as Doc;
    if (HTML_TAG.test(text)) docsWithHtml++;
    if (XBRL_TRACE.test(text)) docsWithXbrl++;
    if (PAGE_NUMBER.test(text)) docsWithPageNums++;
    if (URL_PATTERN.test(text)) docsWithUrls++;
    // Fewer than 3 paragraph breaks suggests the text collapsed to one block.
    if (text.split("\n\n").length < 3) singleBlockDocs++;
  }
}
const pct = (count: number) => ((count / docs.length) * 100).toFixed(1);
console.log(` Residual HTML tags: ${docsWithHtml} docs (${pct(docsWithHtml)}%)`);
console.log(` XBRL/xmlns traces: ${docsWithXbrl} docs (${pct(docsWithXbrl)}%)`);
console.log(` Page number traces: ${docsWithPageNums} docs (${pct(docsWithPageNums)}%)`);
console.log(` URLs present: ${docsWithUrls} docs (${pct(docsWithUrls)}%)`);
console.log(` Single-block (<3¶): ${singleBlockDocs} docs`);
// ── Shard distribution ──────────────────────────────────────────────────
// Per-shard document counts and on-disk sizes. Each shard is read once into a
// buffer (the original read every file twice: once for lines, once for size)
// and the unused `shardIdx` counter is removed.
console.log("\n─── Shard Distribution ───");
for (const shard of shards) {
  const buf = readFileSync(`${CORPUS_DIR}/${shard}`);
  // Non-blank lines correspond to documents, matching the loader's convention.
  const docCount = buf.toString("utf-8").split("\n").filter((l) => l.trim()).length;
  console.log(
    ` ${shard}: ${docCount.toLocaleString().padStart(6)} docs, ${(buf.length / 1e6).toFixed(0).padStart(4)} MB`,
  );
}
// ── Post-filter stats ───────────────────────────────────────────────────
// Corpus size after dropping the <10K-char documents ("tiny" bucket above).
const filtered = docs.filter((d) => d.chars >= 10_000);
let filteredChars = 0;
for (const d of filtered) {
  filteredChars += d.chars;
}
const filteredTokens = Math.round(filteredChars / CHARS_PER_TOKEN);
console.log("\n─── After Filtering <10K chars ───");
console.log(` Documents: ${filtered.length.toLocaleString()} (removed ${docs.length - filtered.length})`);
console.log(` Total chars: ${(filteredChars / 1e9).toFixed(3)}B`);
console.log(` Total tokens (est): ${(filteredTokens / 1e6).toFixed(1)}M`);
console.log(` Token loss: ${((1 - filteredTokens / totalTokens) * 100).toFixed(3)}%`);
// ── Training time estimates ─────────────────────────────────────────────
// Rough wall-clock projections for the filtered corpus on a single RTX 3090,
// across two context-length configurations and a range of per-step times.
console.log("\n─── Training Time Estimates (RTX 3090, bf16, grad_checkpoint) ───");
const configs = [
  { seqLen: 2048, batchSize: 4, gradAccum: 8, secPerStepRange: [1.0, 1.5, 2.0] },
  { seqLen: 8192, batchSize: 1, gradAccum: 32, secPerStepRange: [3.0, 5.0, 7.0] },
];
for (const { seqLen, batchSize, gradAccum, secPerStepRange } of configs) {
  let totalSeqs = 0;
  for (const d of filtered) {
    totalSeqs += Math.ceil(Math.round(d.chars / CHARS_PER_TOKEN) / seqLen);
  }
  const effectiveBatch = batchSize * gradAccum;
  const stepsPerEpoch = Math.ceil(totalSeqs / effectiveBatch);
  console.log(`\n seq_len=${seqLen}, batch=${batchSize}, grad_accum=${gradAccum} (eff=${effectiveBatch})`);
  console.log(` Sequences: ${totalSeqs.toLocaleString()}, Steps/epoch: ${stepsPerEpoch.toLocaleString()}`);
  for (const secPerStep of secPerStepRange) {
    const hoursPerEpoch = (stepsPerEpoch * secPerStep) / 3600;
    console.log(` @ ${secPerStep}s/step: ${hoursPerEpoch.toFixed(1)}h/epoch`);
  }
}