/**
 * Extract styled headings (bold, underline, h-tags) from SEC filing HTML.
 * Produces a per-filing heading cache for paragraph heading detection.
 *
 * Usage: bun run ts/scripts/extract-html-headings.ts
 *
 * Input:  data/raw/html/*.html + data/paragraphs/quality/ambiguous-filings.txt
 * Output: data/paragraphs/quality/filing-headings.jsonl
 *   Each line: {"accession": "...", "headings": ["heading1", "heading2", ...]}
 *
 * The script re-invokes itself with `--worker <start> <end> <outFile>` to
 * process slices of the filing list in parallel, then merges the per-worker
 * temp files into the final JSONL output.
 */

import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
import { cpus } from "node:os";
import { dirname } from "node:path";

const HTML_DIR = "data/raw/html";
const FILING_LIST = "data/paragraphs/quality/ambiguous-filings.txt";
const OUTPUT = "data/paragraphs/quality/filing-headings.jsonl";

/** Locates the start of the Item 1C region (first occurrence only). */
const ITEM_1C_START = /item\s*1c/i;
/** Any later "Item N" marker other than 1C ends the region. */
const NEXT_ITEM = /item\s+(?:2|1[a-bd-z]|[3-9])/i;
/** Characters skipped after the Item 1C match before looking for the next item. */
const NEXT_ITEM_SKIP = 20;
/** Fallback section length when no following item marker is found. */
const MAX_SECTION_CHARS = 200000;

/**
 * Patterns whose first capture group is the styled inner HTML, in the order
 * their matches are reported:
 *   1. <b> or <strong> tags (`\b` keeps <br>, <body>, <blockquote> from
 *      being mistaken for an opening <b>)
 *   2. inline style with font-weight: bold or 600-900
 *   3. inline style with text-decoration: underline
 *   4. <h1>-<h6> tags
 */
const STYLED_TEXT_PATTERNS: readonly RegExp[] = [
  /<(?:b|strong)\b[^>]*>([\s\S]*?)<\/(?:b|strong)>/gi,
  /<[^>]+font-weight\s*:\s*(?:bold|[6-9]00)[^>]*>([\s\S]*?)<\/[^>]+>/gi,
  /<[^>]+text-decoration\s*:\s*underline[^>]*>([\s\S]*?)<\/[^>]+>/gi,
  /<h[1-6]\b[^>]*>([\s\S]*?)<\/h[1-6]>/gi,
];

/** A heading must mention at least one of these topic keywords. */
const HEADING_KEYWORDS =
  /(?:risk|management|strategy|cybersecurity|cyber|governance|oversight|board|directors?|incident|response|recovery|planning|detection|program|process|third[- ]party|security|threats?|assessment|compliance|safeguards?|awareness|training|education|monitoring|integration|framework|practices|personnel|role|controls|policies|procedures|reporting|identification|disclosure|material|enterprise|technology|overview|impact|effects?|vulnerabilit)/i;

/**
 * Extract styled text (bold, underline, h-tags) from HTML within Item 1C.
 *
 * The region starts at the first "Item 1C" match and ends at the next item
 * marker, or ~200KB later when no marker follows.
 * NOTE(review): only the first "Item 1C" occurrence is used; if that is a
 * table-of-contents entry or a cross-reference the region may be very short.
 * Confirm against real filings.
 *
 * @param html Raw filing HTML.
 * @returns Heading strings in first-seen order, de-duplicated
 *   case-insensitively; empty when no Item 1C marker is found.
 */
function extractStyledHeadings(html: string): string[] {
  const start = html.search(ITEM_1C_START);
  if (start === -1) return [];

  // Skip past the Item 1C label itself before searching for the next item.
  const searchFrom = start + NEXT_ITEM_SKIP;
  const nextItemOffset = html.slice(searchFrom).search(NEXT_ITEM);
  // Compare against -1 explicitly: an offset of 0 is a valid match position.
  const end =
    nextItemOffset === -1
      ? Math.min(start + MAX_SECTION_CHARS, html.length)
      : searchFrom + nextItemOffset;
  const section = html.slice(start, end);

  const seen = new Set<string>();
  const unique: string[] = [];
  for (const pattern of STYLED_TEXT_PATTERNS) {
    for (const match of section.matchAll(pattern)) {
      // stripTags already collapses whitespace and trims.
      const text = stripTags(match[1] ?? "");
      if (!isHeadingCandidate(text)) continue;
      const key = text.toLowerCase();
      if (seen.has(key)) continue;
      seen.add(key);
      unique.push(text);
    }
  }
  return unique;
}

/**
 * Strip HTML tags from a string and decode a small set of common entities.
 *
 * Tags are removed before entities are decoded so that escaped markup such as
 * `&lt;b&gt;` survives as literal text. `&amp;` is decoded last so that
 * `&amp;lt;` becomes `&lt;` rather than being decoded twice into `<`.
 *
 * @param html HTML fragment.
 * @returns Plain text with runs of whitespace collapsed to single spaces.
 */
function stripTags(html: string): string {
  return html
    .replace(/<[^>]+>/g, " ")
    .replace(/&nbsp;|&#160;/gi, " ")
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&quot;/g, '"')
    .replace(/&#39;|&#x27;|&apos;/gi, "'")
    .replace(/&mdash;|&#8212;/g, "—")
    .replace(/&ndash;|&#8211;/g, "–")
    .replace(/&amp;/g, "&")
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Check if extracted styled text looks like a heading (not body text).
 *
 * @param text Plain text with tags already stripped.
 * @returns True when the text is 3-150 characters, at most 15 words, and
 *   contains at least one heading-like keyword.
 */
function isHeadingCandidate(text: string): boolean {
  if (text.length < 3 || text.length > 150) return false;
  if (text.split(/\s+/).length > 15) return false;
  return HEADING_KEYWORDS.test(text);
}

/**
 * Read the filing list as one accession per line.
 *
 * Blank lines are dropped and surrounding whitespace (including a trailing
 * `\r`) is removed. Main and worker mode both use this function so the slice
 * indices handed to workers refer to the same list.
 */
function readFilingList(): string[] {
  return readFileSync(FILING_LIST, "utf-8")
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
}

/** Join records as JSONL: one per line with a trailing newline, or "" when empty. */
function toJsonl(records: readonly string[]): string {
  return records.length > 0 ? records.join("\n") + "\n" : "";
}

/** Number of headings in one JSONL record; 0 when the record has no headings array. */
function countHeadings(line: string): number {
  const record: unknown = JSON.parse(line);
  if (
    typeof record === "object" &&
    record !== null &&
    "headings" in record &&
    Array.isArray(record.headings)
  ) {
    return record.headings.length;
  }
  return 0;
}

// ─── Worker mode ───

interface WorkerArgs {
  startIdx: number;
  endIdx: number;
  outFile: string;
}

/**
 * Parse `--worker <startIdx> <endIdx> <outFile>`.
 *
 * @throws Error when an index is not a number or the output path is missing.
 */
function parseWorkerArgs(argv: readonly string[]): WorkerArgs {
  const [, startArg, endArg, outFile] = argv;
  const startIdx = Number.parseInt(startArg ?? "", 10);
  const endIdx = Number.parseInt(endArg ?? "", 10);
  if (Number.isNaN(startIdx) || Number.isNaN(endIdx) || !outFile) {
    throw new Error("usage: --worker <startIdx> <endIdx> <outFile>");
  }
  return { startIdx, endIdx, outFile };
}

/**
 * Process filings [startIdx, endIdx) of the filing list and write one JSONL
 * record per filing whose HTML file exists. Filings without HTML are skipped.
 */
function runWorker(startIdx: number, endIdx: number, outFile: string): void {
  const results: string[] = [];
  for (const acc of readFilingList().slice(startIdx, endIdx)) {
    const htmlPath = `${HTML_DIR}/${acc}.html`;
    if (!existsSync(htmlPath)) continue;
    const headings = extractStyledHeadings(readFileSync(htmlPath, "utf-8"));
    results.push(JSON.stringify({ accession: acc, headings }));
  }
  writeFileSync(outFile, toJsonl(results));
}

// ─── Main mode ───

/**
 * Split the filing list across one worker per CPU, wait for them, merge their
 * temp files into OUTPUT, and print summary statistics to stderr.
 * Sets a non-zero process exit code when any worker fails.
 */
async function main(): Promise<void> {
  const start = Date.now();
  const filings = readFilingList();
  // Guard against platforms that report zero CPUs, which would spawn nothing.
  const nproc = Math.max(1, cpus().length);
  const chunkSize = Math.ceil(filings.length / nproc);

  process.stderr.write(` ${filings.length} filings, ${nproc} workers\n`);

  // Workers write their temp files next to OUTPUT, so the directory must exist.
  mkdirSync(dirname(OUTPUT), { recursive: true });

  const tmpFiles: string[] = [];
  const workers: ReturnType<typeof Bun.spawn>[] = [];
  for (let i = 0; i < nproc; i++) {
    const s = i * chunkSize;
    if (s >= filings.length) break;
    const e = Math.min(s + chunkSize, filings.length);
    const tmpFile = `${OUTPUT}.tmp-${i}`;
    tmpFiles.push(tmpFile);
    workers.push(
      Bun.spawn(
        ["bun", "run", import.meta.filename, "--worker", String(s), String(e), tmpFile],
        { stderr: "inherit" },
      ),
    );
  }

  const exitCodes = await Promise.all(workers.map((w) => w.exited));

  // Merge
  const records: string[] = [];
  for (const tmp of tmpFiles) {
    if (!existsSync(tmp)) continue;
    for (const line of readFileSync(tmp, "utf-8").split("\n")) {
      if (line.trim()) records.push(line);
    }
    try {
      unlinkSync(tmp);
    } catch {
      // Best-effort cleanup: a leftover temp file does not affect the output.
    }
  }
  writeFileSync(OUTPUT, toJsonl(records));

  const elapsed = ((Date.now() - start) / 1000).toFixed(1);

  // Count stats
  let totalHeadings = 0;
  let filingsWithHeadings = 0;
  for (const line of records) {
    const count = countHeadings(line);
    if (count > 0) {
      filingsWithHeadings++;
      totalHeadings += count;
    }
  }

  process.stderr.write(
    `\n Done in ${elapsed}s\n` +
      ` ${filings.length} filings processed\n` +
      ` ${filingsWithHeadings} filings with styled headings\n` +
      ` ${totalHeadings} total heading instances\n` +
      ` Output: ${OUTPUT}\n`,
  );

  // A failed worker leaves its chunk out of the merge; make that visible.
  const failed = exitCodes.filter((code) => code !== 0).length;
  if (failed > 0) {
    process.stderr.write(
      ` ${failed} worker(s) exited with a non-zero status; output is incomplete\n`,
    );
    process.exitCode = 1;
  }
}

const args = process.argv.slice(2);
if (args[0] === "--worker") {
  const { startIdx, endIdx, outFile } = parseWorkerArgs(args);
  runWorker(startIdx, endIdx, outFile);
  process.exit(0);
} else {
  await main();
}