SEC-cyBERT/ts/src/extract/fast-reparse.ts
2026-03-29 20:33:39 -04:00

176 lines
7.6 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Fast reparse of cached HTML files. No network, no cheerio, no large lookups.
* Reads pre-built accession→metadata map and uses regex for HTML stripping.
*/
import { readdirSync, readFileSync, writeFileSync } from "node:fs";
import { segmentParagraphs } from "./segment.ts";
import { stripHtml } from "./html-cleaner.ts";
import type { FilingMeta } from "@sec-cybert/schemas/paragraph.ts";
// Input/output locations. All three are relative paths, so Node's fs functions
// resolve them against the process's current working directory.
// Directory scanned for cached filing pages; only `*.html` entries are read.
const HTML_CACHE_DIR = "../data/raw/html";
// JSONL file written at the end of the run: one JSON-serialized paragraph per line.
const OUTPUT_PATH = "../data/paragraphs/paragraphs.jsonl";
// JSON object read at startup, keyed by accession number (the HTML file name
// without its extension), holding per-filing metadata.
const ACCESSION_META_PATH = "../data/bulk/accession-meta.json";
// ─── Item 1C extraction (regex on stripped text) ───

/** Heading that opens Item 1C (optionally bulleted); a separator must follow "1C". */
const ITEM_1C = /^\s*(\u2022\s*)?item\s*1c[\.\s\u00a0—:-]/i;
/** Any "Item N" / "Items N" heading; capture group 1 is the item designator. */
const NEXT_ITEM = /^\s*items?\s*(\d+[a-z]?\.?\d*)/i;
/**
 * Headings that unconditionally close the section. The apostrophe class accepts
 * both the straight (') and the typographic (\u2019) form of "Management's".
 */
const SECTION_END = /^\s*(signatures?|part\s*(ii|2|iii|3|iv|4)|exhibit\s*index|financial\s+statements|management['\u2019]?s?\s+financial\s+discussion)/i;
// Must be ALL-CAPS and short (< 120 chars) to avoid matching body text
const SUBSIDIARY_HEADER = /^[A-Z][A-Z\s,.'&-]{5,}(?:LLC|INC|CORP|COMPANY|L\.?P\.?)\b.*\bAND\s+SUBSIDIARIES\b/;
/** Titles of the sections that follow Item 1C, for filings whose headings lack "Item N". */
const POST_1C_SECTION = /^\s*(properties|legal\s+proceedings|mine\s+safety|market\s+for\s+registrant|selected\s+financial|management.s\s+discussion|equity\s+compensation\s+plan|stock\s+performance)/i;
/** Section titles that only count as a boundary when they are the entire line. */
const SHORT_SECTION_HEADING = /^\s*(risk\s+factors|controls\s+and\s+procedures|unresolved\s+staff\s+comments|glossary[\s\w]*|employees?\s+and\s+(consultants|human)|subsidiaries|executive\s+officers)\s*$/i;
/** Leading phrases that mark a line as the continuation of the previous one. */
const CONTINUATION_RE = /^(and |or |including |such as |along with |that |which |where |whether |as well as |as described |for example|for more |pursuant to |in addition )/i;
/** Hard caps on the size of the returned section. */
const MAX_BLOCKS = 50;
const MAX_WORDS = 15000;

// Patterns hoisted out of the loops below so they are compiled once.
/** A line consisting only of the word "Item" (heading split across two lines). */
const LONE_ITEM_WORD = /^\s*item\s*$/i;
const STARTS_WITH_DIGIT = /^\s*\d/;
const STARTS_LOWERCASE = /^[a-z]/;
/** Terminal punctuation that marks the previous line as a finished sentence. */
const SENTENCE_END = /[.!?:;\")\u201d]\s*$/;
/** Words suggesting a short line is real prose rather than a page artifact. */
const CONTENT_WORDS = /\b(we|our|the|is|are|has|have|its|this|each|all|any|not|may|can|will|such|including|cybersecurity|security|risk|board|management|incident|threat|assess|monitor|oversee|protect|comply)\b/i;
/** Heading-like openers that must never be glued onto a broken sentence. */
const HEADING_OPENER = /^\s*(item|part|signatures?|exhibit|properties|legal\s+proceedings)/i;
/** "Table of Contents" running title fused onto the front of a content line. */
const TOC_PREFIX = /^\d*\s*table\s+of\s+contents\s+/i;
/** Whole-line page artifacts that are dropped regardless of length. */
const ALWAYS_DROP: readonly RegExp[] = [
  /^[-–—\s]*[A-Za-z]?[-–—]?\s*\d+[-–—\s]*$/, // bare page numbers such as "23", "- 23 -", "F-3"
  /^page\s+\d+$/i,
  /^table of conten\s*t?s?[\s\/]*$/i,
  /^\(?\s*back\s+to\s+(index|top|toc)\s*\)?$/i,
  /^index$/i,
];

/**
 * Splits text into trimmed, whitespace-collapsed, non-empty lines and rejoins
 * headings broken across lines: "ITEM\n2. PROPERTIES" → "ITEM 2. PROPERTIES".
 */
function normalizeLines(text: string): string[] {
  const rawLines = text
    .split("\n")
    .map((l) => l.replace(/\s+/g, " ").trim())
    .filter((l) => l.length > 0);
  const lines: string[] = [];
  for (let i = 0; i < rawLines.length; i++) {
    const line = rawLines[i]!;
    const next = rawLines[i + 1];
    if (next !== undefined && LONE_ITEM_WORD.test(line) && STARTS_WITH_DIGIT.test(next)) {
      lines.push(line + " " + next);
      i++; // the following line has been consumed by the join
    } else {
      lines.push(line);
    }
  }
  return lines;
}

/**
 * Returns the index of the first line after the LAST short Item 1C heading, or
 * -1 when there is none. The last match wins so that an earlier occurrence
 * (e.g. a table-of-contents entry) is not mistaken for the section itself.
 */
function findSectionStart(lines: readonly string[]): number {
  for (let i = lines.length - 1; i >= 0; i--) {
    const line = lines[i]!;
    if (line.length < 300 && ITEM_1C.test(line)) return i + 1;
  }
  return -1;
}

/** Returns the index of the first line at or after `startIdx` that opens another section. */
function findSectionEnd(lines: readonly string[], startIdx: number): number {
  for (let i = startIdx; i < lines.length; i++) {
    const line = lines[i]!;
    if (line.length > 300) continue; // long lines are body text, never headings
    if (SECTION_END.test(line)) return i;
    if (line.length < 150 && POST_1C_SECTION.test(line)) return i;
    if (SHORT_SECTION_HEADING.test(line)) return i;
    const m = NEXT_ITEM.exec(line);
    // A bare "Item 1C" line inside the section is not a boundary.
    if (m && m[1]!.toLowerCase() !== "1c" && line.length < 120) return i;
  }
  return lines.length;
}

/**
 * Filters one section line: returns null for page artifacts (headers, footers,
 * page numbers, running titles), otherwise the text to keep.
 */
function cleanContentLine(line: string): string | null {
  if (line.length < 3) return null;
  if (ALWAYS_DROP.some((re) => re.test(line))) return null;
  if (line.length < 15 && /^part\s+[iv]+$/i.test(line)) return null;
  if (line.length < 120 && /form\s+10-[kq]/i.test(line)) return null;
  if (line.length < 40 && /^\d{4}\s+(form|annual)/i.test(line)) return null;
  // Short ALL-CAPS lines are running titles unless they mention a topic keyword.
  if (line.length < 50 && /^[A-Z#@\s\d]+$/.test(line) && !/CYBER|SECURITY|RISK|BOARD|INCIDENT/i.test(line)) return null;
  if (line.length < 120 && SUBSIDIARY_HEADER.test(line)) return null;
  // Short pipe-separated lines are navigation/footer bars unless on-topic.
  if (line.length < 80 && /\|/.test(line) && !/cyber|security|incident|threat/i.test(line)) return null;
  if (TOC_PREFIX.test(line)) {
    // Keep what follows the running title only if it looks like a sentence.
    const stripped = line.replace(TOC_PREFIX, "").trim();
    return stripped.length >= 5 && /[.!?]/.test(stripped) ? stripped : null;
  }
  return line;
}

/**
 * Merges continuation lines into the preceding block and skips short artifact
 * lines that sit between the two halves of a broken sentence.
 */
function mergeContinuations(contentLines: readonly string[]): string[] {
  const merged: string[] = [];
  for (let ci = 0; ci < contentLines.length; ci++) {
    const line = contentLines[ci]!;
    const hasPrev = merged.length > 0;
    const prev = hasPrev ? merged[merged.length - 1]! : "";
    // "Broken" = the previous block is non-trivial and lacks terminal punctuation.
    const prevIsBroken = hasPrev && prev.length > 15 && !SENTENCE_END.test(prev);
    const next = contentLines[ci + 1];
    if (
      prevIsBroken &&
      line.length < 80 &&
      next !== undefined &&
      STARTS_LOWERCASE.test(next) &&
      !CONTENT_WORDS.test(line)
    ) {
      // The sentence resumes on the next line; this short line is an artifact.
      continue;
    }
    const lcContinuation = prevIsBroken && STARTS_LOWERCASE.test(line);
    const phraseContinuation = hasPrev && prev.length > 15 && CONTINUATION_RE.test(line);
    const brokenMerge = prevIsBroken && line.length > 40 && !HEADING_OPENER.test(line);
    if (lcContinuation || phraseContinuation || brokenMerge) {
      merged[merged.length - 1] = prev + " " + line;
    } else {
      merged.push(line);
    }
  }
  return merged;
}

/**
 * Extracts the Item 1C section from the stripped text of a filing.
 *
 * @param text Plain text with newline-separated lines (HTML already removed).
 * @returns The section's blocks joined by blank lines, capped at MAX_BLOCKS
 *   blocks / MAX_WORDS words, or null when no Item 1C heading is found or
 *   nothing usable remains after cleaning.
 */
function extractItem1C(text: string): string | null {
  const lines = normalizeLines(text);
  const startIdx = findSectionStart(lines);
  if (startIdx === -1) return null;
  const endIdx = findSectionEnd(lines, startIdx);

  // First pass: strip page artifacts (headers, footers, page numbers, running titles)
  const contentLines: string[] = [];
  for (let i = startIdx; i < endIdx; i++) {
    const cleaned = cleanContentLine(lines[i]!);
    if (cleaned !== null) contentLines.push(cleaned);
  }

  // Second pass: merge continuation lines, skip artifacts between broken sentences
  const merged = mergeContinuations(contentLines);

  // Final pass: drop tiny fragments and enforce the size caps.
  const blocks: string[] = [];
  let words = 0;
  for (const line of merged) {
    if (line.length < 5) continue;
    blocks.push(line);
    words += line.split(/\s+/).length;
    if (blocks.length >= MAX_BLOCKS || words >= MAX_WORDS) break;
  }
  return blocks.length > 0 ? blocks.join("\n\n") : null;
}
// ─── Main ───
// Reparses every cached HTML filing that has a metadata entry, extracts Item 1C,
// segments it into paragraphs and writes all paragraphs to OUTPUT_PATH as JSONL.
// Progress and the final summary go to stderr.

/** Per-filing metadata stored in the accession lookup file. */
type AccessionMeta = {
  companyName: string;
  cik: string;
  ticker: string;
  filingDate: string;
  fiscalYear: number;
};

const HTML_EXT = ".html";
const PROGRESS_EVERY = 500;

const start = Date.now();
process.stderr.write(" Loading accession metadata...\n");
// NOTE(review): the parsed JSON is trusted to match this shape; it is not validated.
const accMeta: Record<string, AccessionMeta> = JSON.parse(readFileSync(ACCESSION_META_PATH, "utf-8"));
process.stderr.write(` ${Object.keys(accMeta).length} entries\n`);

const htmlFiles = readdirSync(HTML_CACHE_DIR)
  .filter((f) => f.endsWith(HTML_EXT))
  .sort();
process.stderr.write(` ${htmlFiles.length} HTML files\n\n`);

const output: string[] = [];
let processed = 0;
let skipped = 0;

for (let i = 0; i < htmlFiles.length; i++) {
  const file = htmlFiles[i]!;
  // Strip the extension as a suffix; every entry is known to end with it.
  const accession = file.slice(0, -HTML_EXT.length);
  const meta = accMeta[accession];
  if (!meta) {
    // No metadata for this cached file.
    skipped++;
    continue;
  }

  const html = readFileSync(`${HTML_CACHE_DIR}/${file}`, "utf-8");
  const section = extractItem1C(stripHtml(html));
  if (!section) {
    // No Item 1C section could be extracted.
    skipped++;
    continue;
  }

  const filingMeta: FilingMeta = {
    companyName: meta.companyName,
    cik: meta.cik,
    ticker: meta.ticker,
    filingType: "10-K",
    filingDate: meta.filingDate,
    fiscalYear: meta.fiscalYear,
    accessionNumber: accession,
    secItem: "Item 1C",
  };
  for (const p of segmentParagraphs(section, filingMeta)) output.push(JSON.stringify(p));
  processed++;

  if ((i + 1) % PROGRESS_EVERY === 0) {
    // Parsed filings per minute since start; the line is redrawn in place.
    const rate = ((processed / ((Date.now() - start) / 1000)) * 60).toFixed(0);
    process.stderr.write(
      `\x1b[2K\r ${i + 1}/${htmlFiles.length} │ ${processed} parsed │ ${output.length} paras │ ${skipped} skip │ ${rate}/min`,
    );
  }
}

process.stderr.write("\n Writing...\n");
writeFileSync(OUTPUT_PATH, output.join("\n") + "\n");
const elapsed = ((Date.now() - start) / 1000).toFixed(1);
process.stderr.write(
  `\n ✓ Done in ${elapsed}s: ${processed} filings → ${output.length} paragraphs (${skipped} skipped)\n`,
);