SEC-cyBERT/ts/scripts/extract-html-headings.ts
2026-03-29 20:33:39 -04:00

191 lines
6.5 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Extract styled headings (bold, underline, h-tags) from SEC filing HTML.
* Produces a per-filing heading cache for paragraph heading detection.
*
* Usage: bun run ts/scripts/extract-html-headings.ts
*
* Input: data/raw/html/*.html + data/paragraphs/quality/ambiguous-filings.txt
* Output: data/paragraphs/quality/filing-headings.jsonl
* Each line: {"accession": "...", "headings": ["heading1", "heading2", ...]}
*/
import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
import { cpus } from "node:os";
/** Directory holding the raw filing HTML; workers read `${HTML_DIR}/<entry>.html` for each filing-list entry. */
const HTML_DIR = "data/raw/html";
/** Newline-separated list of filings to process; each line is used as the `accession` value in the output. */
const FILING_LIST = "data/paragraphs/quality/ambiguous-filings.txt";
/** Path of the merged JSONL output; per-worker temp files are written as `${OUTPUT}.tmp-<i>`. */
const OUTPUT = "data/paragraphs/quality/filing-headings.jsonl";
/**
 * Extract styled text (bold, underline, h-tags) from HTML within Item 1C.
 *
 * The Item 1C region is located heuristically: it starts at the first
 * case-insensitive occurrence of "Item 1C" and ends at the next "Item N"
 * marker (any item other than 1C) found at least 20 characters later. When no
 * such marker exists, the region is capped at 200,000 characters.
 *
 * Within that region, four kinds of styled markup are scanned in order:
 * `<b>`/`<strong>`, inline `font-weight` bold (bold or 600-900), inline
 * `text-decoration: underline`, and `<h1>`-`<h6>`. Each captured fragment is
 * tag-stripped and kept only if `isHeadingCandidate` accepts it.
 *
 * @param html - Full HTML of one filing.
 * @returns Heading strings with whitespace collapsed, de-duplicated
 *   case-insensitively, in first-seen order (pattern order, then document
 *   order). Empty when "Item 1C" is not found.
 */
function extractStyledHeadings(html: string): string[] {
  // Find Item 1C region (rough — first textual occurrence of "Item 1C").
  const item1cMatch = /item\s*1c/i.exec(html);
  if (item1cMatch === null) return [];
  const startIdx = item1cMatch.index;

  // Skip past the "Item 1C" label itself before looking for the next Item boundary.
  const searchFrom = startIdx + 20;
  const nextItemMatch = /item\s+(?:2|1[a-bd-z]|[3-9])/i.exec(html.slice(searchFrom));
  // Compare against null rather than testing `index` for truthiness: a match
  // at offset 0 is a valid boundary and must not fall through to the size cap.
  const endIdx =
    nextItemMatch !== null
      ? searchFrom + nextItemMatch.index
      : Math.min(startIdx + 200000, html.length);
  const section = html.slice(startIdx, endIdx);

  // Scanned in this order; the order determines which spelling of a duplicate
  // heading is kept and the order of the returned list.
  const styledTextPatterns: readonly RegExp[] = [
    // Pattern 1: <b> or <strong> tags. The \b keeps <br>, <body>, <button>,
    // etc. from being treated as a bold opener and swallowing body text.
    /<(?:b|strong)\b[^>]*>([\s\S]*?)<\/(?:b|strong)>/gi,
    // Pattern 2: font-weight: bold or font-weight: 600-900 in inline styles
    /<[^>]+font-weight\s*:\s*(?:bold|[6-9]00)[^>]*>([\s\S]*?)<\/[^>]+>/gi,
    // Pattern 3: text-decoration: underline
    /<[^>]+text-decoration\s*:\s*underline[^>]*>([\s\S]*?)<\/[^>]+>/gi,
    // Pattern 4: h1-h6 tags
    /<h[1-6][^>]*>([\s\S]*?)<\/h[1-6]>/gi,
  ];

  // Deduplicate (case-insensitively) and normalize whitespace as we go.
  const seen = new Set<string>();
  const unique: string[] = [];
  for (const pattern of styledTextPatterns) {
    for (const m of section.matchAll(pattern)) {
      const text = stripTags(m[1] ?? "").trim();
      if (!isHeadingCandidate(text)) continue;

      const normalized = text.replace(/\s+/g, " ").trim();
      if (normalized.length < 3) continue;

      const key = normalized.toLowerCase();
      if (seen.has(key)) continue;
      seen.add(key);
      unique.push(normalized);
    }
  }
  return unique;
}
/**
 * Strip HTML tags from a fragment and decode a small set of common entities.
 *
 * Every tag is replaced by a space (so adjacent inline elements do not fuse
 * their text), the entities for non-breaking space, `<`, `>`, `"`, `'`,
 * em dash, en dash and `&` are decoded, and runs of whitespace are collapsed
 * to a single space.
 *
 * @param html - HTML fragment, possibly containing nested tags.
 * @returns Plain text, trimmed, with single spaces between words.
 */
function stripTags(html: string): string {
  return (
    html
      .replace(/<[^>]+>/g, " ")
      .replace(/&nbsp;|&#160;/gi, " ")
      .replace(/&lt;/g, "<")
      .replace(/&gt;/g, ">")
      .replace(/&quot;/g, '"')
      .replace(/&#39;|&apos;/g, "'")
      // Dashes are written as escapes so the source contains no
      // easily-confused or easily-lost Unicode characters.
      .replace(/&mdash;|&#8212;/g, "\u2014")
      // Decode to an en dash rather than deleting it; dropping the character
      // would fuse its neighbours ("2023&ndash;2024" -> "20232024").
      .replace(/&ndash;|&#8211;/g, "\u2013")
      // `&amp;` must be decoded last: decoding it earlier would turn an
      // escaped entity such as "&amp;lt;" into "&lt;" and then into "<".
      .replace(/&amp;/g, "&")
      .replace(/\s+/g, " ")
      .trim()
  );
}
/**
 * Decide whether a piece of styled text reads like a section heading rather
 * than emphasised body text.
 *
 * A candidate must be 3-150 characters long, contain at most 15
 * whitespace-separated words, and mention at least one term from the
 * heading keyword list (matched case-insensitively, anywhere in the text).
 *
 * @param text - Plain text extracted from a styled element.
 * @returns `true` when all three conditions hold.
 */
function isHeadingCandidate(text: string): boolean {
  const charCount = text.length;
  const lengthOk = charCount >= 3 && charCount <= 150;
  if (!lengthOk) return false;

  const wordCount = text.split(/\s+/).length;
  if (wordCount > 15) return false;

  // Must contain at least one heading-like keyword
  const headingKeyword =
    /(?:risk|management|strategy|cybersecurity|cyber|governance|oversight|board|directors?|incident|response|recovery|planning|detection|program|process|third[- ]party|security|threats?|assessment|compliance|safeguards?|awareness|training|education|monitoring|integration|framework|practices|personnel|role|controls|policies|procedures|reporting|identification|disclosure|material|enterprise|technology|overview|impact|effects?|vulnerabilit)/i;
  return headingKeyword.test(text);
}
// ─── Worker mode ───
// Invoked as: <script> --worker <startIdx> <endIdx> <outFile>
// Processes the filing-list lines in [startIdx, endIdx) and writes one JSON
// record per filing whose HTML file exists, then exits.
const args = process.argv.slice(2);
if (args[0] === "--worker") {
  const [, fromArg, toArg, outFile] = args;
  const from = parseInt(fromArg!);
  const to = parseInt(toArg!);

  const assigned = readFileSync(FILING_LIST, "utf-8").trim().split("\n").slice(from, to);

  // Filings without a downloaded HTML file contribute no record at all.
  const records = assigned.flatMap((accession) => {
    const htmlPath = `${HTML_DIR}/${accession}.html`;
    if (!existsSync(htmlPath)) return [];
    const headings = extractStyledHeadings(readFileSync(htmlPath, "utf-8"));
    return [JSON.stringify({ accession, headings })];
  });

  // Newline-terminate only when there is something to write, so an empty
  // chunk produces an empty file rather than a blank line.
  const payload = records.length > 0 ? records.join("\n") + "\n" : "";
  writeFileSync(outFile!, payload);
  process.exit(0);
}
// ─── Main mode ───
// Splits the filing list into contiguous index ranges (one per CPU), re-runs
// this same script in --worker mode for each range, then merges the
// per-worker temp files into OUTPUT and prints summary statistics to stderr.
const start = Date.now();
const filings = readFileSync(FILING_LIST, "utf-8").trim().split("\n");
// Clamp to at least one worker: a CPU count of 0 would make chunkSize
// Infinity and spawn nothing.
const nproc = Math.max(1, cpus().length);
const chunkSize = Math.ceil(filings.length / nproc);
process.stderr.write(` ${filings.length} filings, ${nproc} workers\n`);

const tmpFiles: string[] = [];
const workers: ReturnType<typeof Bun.spawn>[] = [];
for (let i = 0; i < nproc; i++) {
  const s = i * chunkSize;
  const e = Math.min(s + chunkSize, filings.length);
  if (s >= filings.length) break;
  const tmpFile = `${OUTPUT}.tmp-${i}`;
  tmpFiles.push(tmpFile);
  workers.push(
    Bun.spawn(
      ["bun", "run", import.meta.filename, "--worker", String(s), String(e), tmpFile],
      { stderr: "inherit" },
    ),
  );
}

// Wait for every worker and surface failures instead of silently merging a
// partial result set.
const exitCodes = await Promise.all(workers.map((w) => w.exited));
const failedWorkers = exitCodes.filter((code) => code !== 0).length;
if (failedWorkers > 0) {
  process.stderr.write(
    ` WARNING: ${failedWorkers} worker(s) exited with a non-zero status; output may be incomplete\n`,
  );
  process.exitCode = 1;
}

// Merge
const allResults: string[] = [];
for (const tmp of tmpFiles) {
  if (!existsSync(tmp)) continue;
  const content = readFileSync(tmp, "utf-8").trim();
  if (content) allResults.push(content);
  try {
    unlinkSync(tmp);
  } catch {
    // Best-effort cleanup: a leftover temp file does not affect OUTPUT.
  }
}
const merged = allResults.join("\n");
// Newline-terminate only non-empty output so an empty run yields an empty
// file, not a blank JSONL line (same convention the workers use).
writeFileSync(OUTPUT, merged.length > 0 ? merged + "\n" : "");
const elapsed = ((Date.now() - start) / 1000).toFixed(1);

// Count stats
let totalHeadings = 0;
let filingsWithHeadings = 0;
for (const line of merged.split("\n")) {
  if (!line.trim()) continue;
  // Records are produced by this script's own worker mode, which always
  // writes a `headings` array.
  const record: { headings: string[] } = JSON.parse(line);
  if (record.headings.length > 0) {
    filingsWithHeadings++;
    totalHeadings += record.headings.length;
  }
}
process.stderr.write(
  `\n Done in ${elapsed}s\n` +
    ` ${filings.length} filings processed\n` +
    ` ${filingsWithHeadings} filings with styled headings\n` +
    ` ${totalHeadings} total heading instances\n` +
    ` Output: ${OUTPUT}\n`,
);