/**
 * Extract styled headings (bold, underline, h-tags) from SEC filing HTML.
 * Produces a per-filing heading cache for paragraph heading detection.
 *
 * Usage: bun run ts/scripts/extract-html-headings.ts
 *
 * Input: data/raw/html/*.html + data/paragraphs/quality/ambiguous-filings.txt
 * Output: data/paragraphs/quality/filing-headings.jsonl
 * Each line: {"accession": "...", "headings": ["heading1", "heading2", ...]}
 */
|
||
import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
import { cpus } from "node:os";
// Directory of raw filing HTML, one `<accession>.html` file per filing.
const HTML_DIR = "data/raw/html";
// Input list: one accession number per line.
const FILING_LIST = "data/paragraphs/quality/ambiguous-filings.txt";
// Output JSONL: one {"accession", "headings"} record per filing.
const OUTPUT = "data/paragraphs/quality/filing-headings.jsonl";
||
/**
|
||
* Extract styled text (bold, underline, h-tags) from HTML within Item 1C.
|
||
* Returns an array of heading strings found.
|
||
*/
|
||
function extractStyledHeadings(html: string): string[] {
|
||
// Find Item 1C region (rough — look for "Item 1C" and take the next ~200KB)
|
||
const item1cMatch = html.match(/item\s*1c/i);
|
||
if (!item1cMatch || item1cMatch.index === undefined) return [];
|
||
|
||
const startIdx = item1cMatch.index;
|
||
// Look for next Item boundary or end of filing
|
||
const nextItemMatch = html.slice(startIdx + 20).match(/item\s+(?:2|1[a-bd-z]|[3-9])/i);
|
||
const endIdx = nextItemMatch?.index
|
||
? startIdx + 20 + nextItemMatch.index
|
||
: Math.min(startIdx + 200000, html.length);
|
||
|
||
const section = html.slice(startIdx, endIdx);
|
||
|
||
const headings: string[] = [];
|
||
|
||
// Pattern 1: <b> or <strong> tags
|
||
const boldRegex = /<(?:b|strong)[^>]*>([\s\S]*?)<\/(?:b|strong)>/gi;
|
||
for (const m of section.matchAll(boldRegex)) {
|
||
const text = stripTags(m[1]!).trim();
|
||
if (isHeadingCandidate(text)) headings.push(text);
|
||
}
|
||
|
||
// Pattern 2: font-weight: bold or font-weight: 700 in inline styles
|
||
const boldStyleRegex = /<[^>]+font-weight\s*:\s*(?:bold|[6-9]00)[^>]*>([\s\S]*?)<\/[^>]+>/gi;
|
||
for (const m of section.matchAll(boldStyleRegex)) {
|
||
const text = stripTags(m[1]!).trim();
|
||
if (isHeadingCandidate(text)) headings.push(text);
|
||
}
|
||
|
||
// Pattern 3: text-decoration: underline
|
||
const underlineRegex = /<[^>]+text-decoration\s*:\s*underline[^>]*>([\s\S]*?)<\/[^>]+>/gi;
|
||
for (const m of section.matchAll(underlineRegex)) {
|
||
const text = stripTags(m[1]!).trim();
|
||
if (isHeadingCandidate(text)) headings.push(text);
|
||
}
|
||
|
||
// Pattern 4: h1-h6 tags
|
||
const hRegex = /<h[1-6][^>]*>([\s\S]*?)<\/h[1-6]>/gi;
|
||
for (const m of section.matchAll(hRegex)) {
|
||
const text = stripTags(m[1]!).trim();
|
||
if (isHeadingCandidate(text)) headings.push(text);
|
||
}
|
||
|
||
// Deduplicate and normalize
|
||
const seen = new Set<string>();
|
||
const unique: string[] = [];
|
||
for (const h of headings) {
|
||
const normalized = h.replace(/\s+/g, " ").trim();
|
||
if (normalized.length < 3) continue;
|
||
const key = normalized.toLowerCase();
|
||
if (!seen.has(key)) {
|
||
seen.add(key);
|
||
unique.push(normalized);
|
||
}
|
||
}
|
||
|
||
return unique;
|
||
}
|
||
|
||
/** Strip HTML tags from a string. */
|
||
function stripTags(html: string): string {
|
||
return html
|
||
.replace(/<[^>]+>/g, " ")
|
||
.replace(/ | /gi, " ")
|
||
.replace(/&/g, "&")
|
||
.replace(/</g, "<")
|
||
.replace(/>/g, ">")
|
||
.replace(/"/g, '"')
|
||
.replace(/'|'/g, "'")
|
||
.replace(/—|—/g, "—")
|
||
.replace(/–|–/g, "–")
|
||
.replace(/\s+/g, " ")
|
||
.trim();
|
||
}
|
||
|
||
/** Check if extracted styled text looks like a heading (not body text). */
|
||
function isHeadingCandidate(text: string): boolean {
|
||
if (text.length < 3 || text.length > 150) return false;
|
||
const words = text.split(/\s+/);
|
||
if (words.length > 15) return false;
|
||
// Must contain at least one heading-like keyword
|
||
if (!/(?:risk|management|strategy|cybersecurity|cyber|governance|oversight|board|directors?|incident|response|recovery|planning|detection|program|process|third[- ]party|security|threats?|assessment|compliance|safeguards?|awareness|training|education|monitoring|integration|framework|practices|personnel|role|controls|policies|procedures|reporting|identification|disclosure|material|enterprise|technology|overview|impact|effects?|vulnerabilit)/i.test(text)) {
|
||
return false;
|
||
}
|
||
return true;
|
||
}
|
||
|
||
// ─── Worker mode ───
|
||
const args = process.argv.slice(2);
|
||
if (args[0] === "--worker") {
|
||
const startIdx = parseInt(args[1]!);
|
||
const endIdx = parseInt(args[2]!);
|
||
const outFile = args[3]!;
|
||
const filings = readFileSync(FILING_LIST, "utf-8").trim().split("\n").slice(startIdx, endIdx);
|
||
|
||
const results: string[] = [];
|
||
for (const acc of filings) {
|
||
const htmlPath = `${HTML_DIR}/${acc}.html`;
|
||
if (!existsSync(htmlPath)) continue;
|
||
const html = readFileSync(htmlPath, "utf-8");
|
||
const headings = extractStyledHeadings(html);
|
||
results.push(JSON.stringify({ accession: acc, headings }));
|
||
}
|
||
|
||
writeFileSync(outFile, results.join("\n") + (results.length > 0 ? "\n" : ""));
|
||
process.exit(0);
|
||
}
|
||
|
||
// ─── Main mode ───
|
||
const start = Date.now();
|
||
const filings = readFileSync(FILING_LIST, "utf-8").trim().split("\n");
|
||
const nproc = cpus().length;
|
||
const chunkSize = Math.ceil(filings.length / nproc);
|
||
|
||
process.stderr.write(` ${filings.length} filings, ${nproc} workers\n`);
|
||
|
||
const tmpFiles: string[] = [];
|
||
const workers: ReturnType<typeof Bun.spawn>[] = [];
|
||
|
||
for (let i = 0; i < nproc; i++) {
|
||
const s = i * chunkSize;
|
||
const e = Math.min(s + chunkSize, filings.length);
|
||
if (s >= filings.length) break;
|
||
const tmpFile = `${OUTPUT}.tmp-${i}`;
|
||
tmpFiles.push(tmpFile);
|
||
workers.push(
|
||
Bun.spawn(
|
||
["bun", "run", import.meta.filename, "--worker", String(s), String(e), tmpFile],
|
||
{ stderr: "inherit" },
|
||
)
|
||
);
|
||
}
|
||
|
||
for (const w of workers) await w.exited;
|
||
|
||
// Merge
|
||
const allResults: string[] = [];
|
||
for (const tmp of tmpFiles) {
|
||
if (existsSync(tmp)) {
|
||
const content = readFileSync(tmp, "utf-8").trim();
|
||
if (content) allResults.push(content);
|
||
try { require("node:fs").unlinkSync(tmp); } catch {}
|
||
}
|
||
}
|
||
|
||
writeFileSync(OUTPUT, allResults.join("\n") + "\n");
|
||
|
||
const elapsed = ((Date.now() - start) / 1000).toFixed(1);
|
||
|
||
// Count stats
|
||
let totalHeadings = 0;
|
||
let filingsWithHeadings = 0;
|
||
for (const line of allResults.join("\n").split("\n")) {
|
||
if (!line.trim()) continue;
|
||
const d = JSON.parse(line);
|
||
if (d.headings.length > 0) {
|
||
filingsWithHeadings++;
|
||
totalHeadings += d.headings.length;
|
||
}
|
||
}
|
||
|
||
process.stderr.write(
|
||
`\n Done in ${elapsed}s\n` +
|
||
` ${filings.length} filings processed\n` +
|
||
` ${filingsWithHeadings} filings with styled headings\n` +
|
||
` ${totalHeadings} total heading instances\n` +
|
||
` Output: ${OUTPUT}\n`,
|
||
);
|