SEC-cyBERT/ts/scripts/extract-html-headings.ts

/**
 * Extract styled headings (bold, underline, h-tags) from SEC filing HTML.
 * Produces a per-filing heading cache for paragraph heading detection.
 *
 * Usage: bun run ts/scripts/extract-html-headings.ts
 *
 * Input:  data/raw/html/*.html + data/paragraphs/quality/ambiguous-filings.txt
 * Output: data/paragraphs/quality/filing-headings.jsonl
 *         Each line: {"accession": "...", "headings": ["heading1", "heading2", ...]}
 */
import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
import { cpus } from "node:os";

const HTML_DIR = "data/raw/html";
const FILING_LIST = "data/paragraphs/quality/ambiguous-filings.txt";
const OUTPUT = "data/paragraphs/quality/filing-headings.jsonl";

/**
 * Extract styled text (bold, underline, h-tags) from HTML within Item 1C.
 *
 * The Item 1C region is located heuristically: it starts at the first
 * case-insensitive occurrence of "Item 1C" and ends at the next "Item <n>"
 * marker found at least 20 characters later, or 200,000 characters after the
 * start (capped at the end of the document) when no such marker exists.
 *
 * @param html - Raw HTML of a filing.
 * @returns Candidate heading strings in discovery order (all bold-tag matches,
 *   then inline bold styles, then underlines, then h1–h6), whitespace-normalized
 *   and de-duplicated case-insensitively. Empty when "Item 1C" is not found.
 */
function extractStyledHeadings(html: string): string[] {
  // Find Item 1C region (rough — first textual occurrence of "Item 1C").
  const item1cMatch = /item\s*1c/i.exec(html);
  if (item1cMatch === null) return [];

  const startIdx = item1cMatch.index;
  // Skip past the "Item 1C" marker itself before looking for the next Item boundary.
  const searchFrom = startIdx + 20;
  const nextItemMatch = /item\s+(?:2|1[a-bd-z]|[3-9])/i.exec(html.slice(searchFrom));
  // Compare against null rather than testing index truthiness: a match at
  // index 0 is a valid boundary and must not fall through to the 200KB default.
  const endIdx = nextItemMatch !== null
    ? searchFrom + nextItemMatch.index
    : Math.min(startIdx + 200000, html.length);

  const section = html.slice(startIdx, endIdx);

  // Order matters: results are collected pattern by pattern, and the first
  // spelling of a heading wins during de-duplication below.
  const styledPatterns: readonly RegExp[] = [
    // Pattern 1: <b> or <strong> tags. The \b keeps <br>, <body>, <big>, …
    // from being mistaken for an opening <b> tag.
    /<(?:b|strong)\b[^>]*>([\s\S]*?)<\/(?:b|strong)>/gi,
    // Pattern 2: font-weight: bold or font-weight: 600–900 in inline styles
    /<[^>]+font-weight\s*:\s*(?:bold|[6-9]00)[^>]*>([\s\S]*?)<\/[^>]+>/gi,
    // Pattern 3: text-decoration: underline
    /<[^>]+text-decoration\s*:\s*underline[^>]*>([\s\S]*?)<\/[^>]+>/gi,
    // Pattern 4: h1-h6 tags
    /<h[1-6][^>]*>([\s\S]*?)<\/h[1-6]>/gi,
  ];

  const headings: string[] = [];
  for (const pattern of styledPatterns) {
    for (const m of section.matchAll(pattern)) {
      const text = stripTags(m[1] ?? "").trim();
      if (isHeadingCandidate(text)) headings.push(text);
    }
  }

  // Deduplicate (case-insensitively) and normalize whitespace
  const seen = new Set<string>();
  const unique: string[] = [];
  for (const h of headings) {
    const normalized = h.replace(/\s+/g, " ").trim();
    if (normalized.length < 3) continue;
    const key = normalized.toLowerCase();
    if (seen.has(key)) continue;
    seen.add(key);
    unique.push(normalized);
  }

  return unique;
}

/**
 * Convert an HTML fragment to plain text.
 *
 * Replaces every tag with a space, decodes a small fixed set of entities
 * (nbsp, lt, gt, quot, apostrophe, em/en dash, amp), collapses whitespace
 * runs to a single space, and trims the result.
 *
 * @param html - HTML fragment to clean.
 * @returns The plain-text content of the fragment.
 */
function stripTags(html: string): string {
  return html
    .replace(/<[^>]+>/g, " ")
    .replace(/&nbsp;|&#160;/gi, " ")
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&quot;/g, '"')
    .replace(/&#39;|&apos;/g, "'")
    .replace(/&mdash;|&#8212;/g, "—")
    .replace(/&ndash;|&#8211;/g, "–")
    // "&amp;" is decoded last so that escaped entities such as "&amp;lt;"
    // become the literal text "&lt;" instead of being unescaped twice.
    .replace(/&amp;/g, "&")
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Decide whether a piece of styled text reads like a section heading rather
 * than body text: 3–150 characters, at most 15 whitespace-separated words,
 * and at least one heading-like keyword (matched case-insensitively anywhere
 * in the text).
 */
function isHeadingCandidate(text: string): boolean {
  const headingKeywords = /(?:risk|management|strategy|cybersecurity|cyber|governance|oversight|board|directors?|incident|response|recovery|planning|detection|program|process|third[- ]party|security|threats?|assessment|compliance|safeguards?|awareness|training|education|monitoring|integration|framework|practices|personnel|role|controls|policies|procedures|reporting|identification|disclosure|material|enterprise|technology|overview|impact|effects?|vulnerabilit)/i;

  const charCount = text.length;
  const lengthOk = charCount >= 3 && charCount <= 150;
  if (!lengthOk) return false;

  const wordCount = text.split(/\s+/).length;
  return wordCount <= 15 && headingKeywords.test(text);
}

// ─── Worker mode ───
// Invoked as: <script> --worker <startIdx> <endIdx> <outFile>
// Processes entries [startIdx, endIdx) of FILING_LIST and writes one JSON line
// per filing whose HTML file exists to <outFile>, then exits.
const args = process.argv.slice(2);
if (args[0] === "--worker") {
  const startIdx = Number.parseInt(args[1] ?? "", 10);
  const endIdx = Number.parseInt(args[2] ?? "", 10);
  const outFile = args[3];
  // A NaN index would silently turn into slice(0, …) and process the wrong
  // range, so reject malformed arguments up front.
  if (Number.isNaN(startIdx) || Number.isNaN(endIdx) || !outFile) {
    process.stderr.write("usage: --worker <startIdx> <endIdx> <outFile>\n");
    process.exit(1);
  }

  // The list is split exactly as in main mode so the index range lines up.
  const filings = readFileSync(FILING_LIST, "utf-8").trim().split("\n").slice(startIdx, endIdx);

  const results: string[] = [];
  for (const rawAcc of filings) {
    // Tolerate CRLF line endings and stray whitespace in the list file.
    const acc = rawAcc.trim();
    if (!acc) continue;
    const htmlPath = `${HTML_DIR}/${acc}.html`;
    // Filings without a downloaded HTML file are skipped (no output line).
    if (!existsSync(htmlPath)) continue;
    const html = readFileSync(htmlPath, "utf-8");
    const headings = extractStyledHeadings(html);
    results.push(JSON.stringify({ accession: acc, headings }));
  }

  // Always write the file (possibly empty); terminate with a newline only when there is content.
  writeFileSync(outFile, results.join("\n") + (results.length > 0 ? "\n" : ""));
  process.exit(0);
}

// ─── Main mode ───
// Fans the filing list out to one worker process per CPU (each handling a
// contiguous index range), merges the per-worker temp files into OUTPUT, and
// prints summary statistics to stderr.
const start = Date.now();
// Must be split exactly as in worker mode: each worker re-reads FILING_LIST
// and slices it with the indices computed here.
const filings = readFileSync(FILING_LIST, "utf-8").trim().split("\n");
// cpus() may return an empty array in some environments; always use at least
// one worker so chunkSize stays finite.
const nproc = Math.max(1, cpus().length);
const chunkSize = Math.ceil(filings.length / nproc);

process.stderr.write(`  ${filings.length} filings, ${nproc} workers\n`);

const tmpFiles: string[] = [];
const workers: ReturnType<typeof Bun.spawn>[] = [];

for (let i = 0; i < nproc; i++) {
  const s = i * chunkSize;
  // Fewer filings than CPUs: stop once every filing has been assigned.
  if (s >= filings.length) break;
  const e = Math.min(s + chunkSize, filings.length);
  const tmpFile = `${OUTPUT}.tmp-${i}`;
  tmpFiles.push(tmpFile);
  workers.push(
    Bun.spawn(
      ["bun", "run", import.meta.filename, "--worker", String(s), String(e), tmpFile],
      { stderr: "inherit" },
    )
  );
}

// Wait for every worker and surface failures instead of silently producing
// partial output.
const exitCodes = await Promise.all(workers.map((w) => w.exited));
const failedWorkers = exitCodes.filter((code) => code !== 0).length;
if (failedWorkers > 0) {
  process.stderr.write(
    `  WARNING: ${failedWorkers} worker(s) exited with a non-zero status; output may be incomplete\n`,
  );
  process.exitCode = 1;
}

// Merge: collect the non-blank JSON lines of each temp file in worker order.
const allLines: string[] = [];
for (const tmp of tmpFiles) {
  if (!existsSync(tmp)) continue;
  for (const line of readFileSync(tmp, "utf-8").split("\n")) {
    if (line.trim()) allLines.push(line);
  }
  try {
    unlinkSync(tmp);
  } catch {
    // Cleanup is best-effort; a leftover temp file does not affect the output.
    process.stderr.write(`  WARNING: could not remove ${tmp}\n`);
  }
}

// Same convention as worker mode: trailing newline only when there is content.
writeFileSync(OUTPUT, allLines.length > 0 ? allLines.join("\n") + "\n" : "");

const elapsed = ((Date.now() - start) / 1000).toFixed(1);

// Count stats
let totalHeadings = 0;
let filingsWithHeadings = 0;
for (const line of allLines) {
  const record = JSON.parse(line) as { headings: string[] };
  if (record.headings.length > 0) {
    filingsWithHeadings++;
    totalHeadings += record.headings.length;
  }
}

process.stderr.write(
  `\n  Done in ${elapsed}s\n` +
  `  ${filings.length} filings processed\n` +
  `  ${filingsWithHeadings} filings with styled headings\n` +
  `  ${totalHeadings} total heading instances\n` +
  `  Output: ${OUTPUT}\n`,
);