176 lines
7.6 KiB
TypeScript
176 lines
7.6 KiB
TypeScript
/**
 * Fast reparse of cached HTML files. No network, no cheerio, no large lookups.
 * Reads a pre-built accession → metadata map and uses regex for HTML stripping.
 */
|
||
import { readdirSync, readFileSync, writeFileSync } from "node:fs";
|
||
import { segmentParagraphs } from "./segment.ts";
|
||
import { stripHtml } from "./html-cleaner.ts";
|
||
import type { FilingMeta } from "@sec-cybert/schemas/paragraph.ts";
|
||
|
||
// NOTE: all three paths are relative, so Node resolves them against the
// working directory the script is launched from, not this file's location.

/** Directory scanned for cached filings; only `*.html` entries are read. */
const HTML_CACHE_DIR = "../data/raw/html";

/** Destination JSONL file; it is overwritten with one JSON object per line. */
const OUTPUT_PATH = "../data/paragraphs/paragraphs.jsonl";

/** JSON file holding the pre-built accession → filing-metadata map. */
const ACCESSION_META_PATH = "../data/bulk/accession-meta.json";
|
||
|
||
// ─── Item 1C extraction (regex on stripped text) ───

/** Opening heading of Item 1C, optionally bullet-prefixed ("• Item 1C. …"). */
const ITEM_1C = /^\s*(\u2022\s*)?item\s*1c[\.\s\u00a0—–:-]/i;

/** Any "Item N" / "Items N" heading; group 1 captures the item number (e.g. "2.", "1A."). */
const NEXT_ITEM = /^\s*items?\s*(\d+[a-z]?\.?\d*)/i;

/**
 * Headings that unconditionally end the section. The apostrophe class accepts
 * both the straight (') and the typographic (\u2019) form of "Management's".
 */
const SECTION_END = /^\s*(signatures?|part\s*(ii|2|iii|3|iv|4)|exhibit\s*index|financial\s+statements|management['\u2019]?s?\s+financial\s+discussion)/i;

// Must be ALL-CAPS and short (< 120 chars) to avoid matching body text
const SUBSIDIARY_HEADER = /^[A-Z][A-Z\s,.'&-]{5,}(?:LLC|INC|CORP|COMPANY|L\.?P\.?)\b.*\bAND\s+SUBSIDIARIES\b/;

/** Titles of sections that follow Item 1C; only trusted on lines shorter than 150 chars. */
const POST_1C_SECTION = /^\s*(properties|legal\s+proceedings|mine\s+safety|market\s+for\s+registrant|selected\s+financial|management.s\s+discussion|equity\s+compensation\s+plan|stock\s+performance)/i;

/** Section titles that end the section only when they make up the entire line. */
const SHORT_SECTION_HEADING = /^\s*(risk\s+factors|controls\s+and\s+procedures|unresolved\s+staff\s+comments|glossary[\s\w]*|employees?\s+and\s+(consultants|human)|subsidiaries|executive\s+officers)\s*$/i;

/** Leading phrases that mark a line as the continuation of the previous one. */
const CONTINUATION_RE = /^(and |or |including |such as |along with |that |which |where |whether |as well as |as described |for example|for more |pursuant to |in addition )/i;

/** Maximum number of text blocks returned for one section. */
const MAX_BLOCKS = 50;

/** Word budget: no further blocks are added once this many words are collected. */
const MAX_WORDS = 15000;

// Heading repair: a line consisting only of "Item", followed by a line starting with a digit.
const LONE_ITEM_WORD = /^\s*item\s*$/i;
const STARTS_WITH_DIGIT = /^\s*\d/;

// Page-artifact patterns (hoisted so they are not re-created for every line).
const PAGE_NUMBER = /^[-–—\s]*[A-Za-z]?[-–—]?\s*\d+[-–—\s]*$/;
const PAGE_LABEL = /^page\s+\d+$/i;
const TOC_ONLY = /^table of conten\s*t?s?[\s\/]*$/i;
const PART_MARKER = /^part\s+[iv]+$/i;
const BACK_LINK = /^\(?\s*back\s+to\s+(index|top|toc)\s*\)?$/i;
const INDEX_ONLY = /^index$/i;
const FORM_REFERENCE = /form\s+10-[kq]/i;
const YEAR_RUNNING_TITLE = /^\d{4}\s+(form|annual)/i;
const CAPS_ONLY = /^[A-Z#@\s\d]+$/;
const CAPS_KEEP = /CYBER|SECURITY|RISK|BOARD|INCIDENT/i;
const HAS_PIPE = /\|/;
const PIPE_KEEP = /cyber|security|incident|threat/i;
const TOC_PREFIX = /^\d*\s*table\s+of\s+contents\s+/i;
const SENTENCE_PUNCT = /[.!?]/;

// Continuation-merge patterns.
const ENDS_SENTENCE = /[.!?:;\")\u201d]\s*$/;
const STARTS_LOWERCASE = /^[a-z]/;
const CONTENT_WORDS = /\b(we|our|the|is|are|has|have|its|this|each|all|any|not|may|can|will|such|including|cybersecurity|security|risk|board|management|incident|threat|assess|monitor|oversee|protect|comply)\b/i;
const HEADING_START = /^\s*(item|part|signatures?|exhibit|properties|legal\s+proceedings)/i;

/**
 * Splits text into whitespace-collapsed, non-empty lines and rejoins headings
 * broken across two lines: "ITEM\n2. PROPERTIES" → "ITEM 2. PROPERTIES".
 */
function normalizeLines(text: string): string[] {
  const rawLines = text
    .split("\n")
    .map((l) => l.replace(/\s+/g, " ").trim())
    .filter((l) => l.length > 0);

  const lines: string[] = [];
  for (let i = 0; i < rawLines.length; i++) {
    const line = rawLines[i] ?? "";
    const next = rawLines[i + 1];
    if (next !== undefined && LONE_ITEM_WORD.test(line) && STARTS_WITH_DIGIT.test(next)) {
      lines.push(`${line} ${next}`);
      i++; // the following line was consumed by the join
    } else {
      lines.push(line);
    }
  }
  return lines;
}

/**
 * Returns the index just past the LAST short (< 300 chars) Item 1C heading,
 * or -1 when there is none. Using the last occurrence means earlier mentions
 * of the heading (for example a table-of-contents entry) are passed over.
 */
function findSectionStart(lines: readonly string[]): number {
  for (let i = lines.length - 1; i >= 0; i--) {
    const line = lines[i] ?? "";
    if (line.length < 300 && ITEM_1C.test(line)) return i + 1;
  }
  return -1;
}

/** Returns the index of the first line at or after `startIdx` that opens another section. */
function findSectionEnd(lines: readonly string[], startIdx: number): number {
  for (let i = startIdx; i < lines.length; i++) {
    const line = lines[i] ?? "";
    if (line.length > 300) continue; // too long to be a heading
    if (SECTION_END.test(line)) return i;
    if (line.length < 150 && POST_1C_SECTION.test(line)) return i;
    if (SHORT_SECTION_HEADING.test(line)) return i;
    const m = NEXT_ITEM.exec(line);
    if (m && line.length < 120 && (m[1] ?? "").toLowerCase() !== "1c") return i;
  }
  return lines.length;
}

/** True for headers, footers, page numbers, running titles and similar layout residue. */
function isPageArtifact(line: string): boolean {
  const len = line.length;
  if (len < 3) return true;
  if (PAGE_NUMBER.test(line) || PAGE_LABEL.test(line)) return true;
  if (TOC_ONLY.test(line) || BACK_LINK.test(line) || INDEX_ONLY.test(line)) return true;
  if (len < 15 && PART_MARKER.test(line)) return true;
  if (len < 120 && FORM_REFERENCE.test(line)) return true;
  if (len < 40 && YEAR_RUNNING_TITLE.test(line)) return true;
  // Short all-caps lines are dropped unless they mention a topical keyword.
  if (len < 50 && CAPS_ONLY.test(line) && !CAPS_KEEP.test(line)) return true;
  if (len < 120 && SUBSIDIARY_HEADER.test(line)) return true;
  // Short pipe-separated lines are dropped unless they mention a topical keyword.
  if (len < 80 && HAS_PIPE.test(line) && !PIPE_KEEP.test(line)) return true;
  return false;
}

/**
 * First pass: removes page artifacts. A line that merely starts with a
 * "Table of Contents" prefix keeps its remainder when that remainder looks
 * like a sentence (at least 5 chars and containing `.`, `!` or `?`).
 */
function stripPageArtifacts(sectionLines: readonly string[]): string[] {
  const kept: string[] = [];
  for (const line of sectionLines) {
    if (isPageArtifact(line)) continue;
    if (TOC_PREFIX.test(line)) {
      const remainder = line.replace(TOC_PREFIX, "").trim();
      if (remainder.length >= 5 && SENTENCE_PUNCT.test(remainder)) kept.push(remainder);
      continue;
    }
    kept.push(line);
  }
  return kept;
}

/**
 * Second pass: glues continuation lines onto the previous block and drops
 * short artifact lines that sit between the two halves of a broken sentence.
 */
function mergeContinuations(contentLines: readonly string[]): string[] {
  const merged: string[] = [];
  for (let ci = 0; ci < contentLines.length; ci++) {
    const line = contentLines[ci] ?? "";
    const hasPrev = merged.length > 0;
    const prev = hasPrev ? (merged[merged.length - 1] ?? "") : "";
    // "Broken" = previous block is non-trivial and lacks closing punctuation.
    const prevIsBroken = hasPrev && prev.length > 15 && !ENDS_SENTENCE.test(prev);

    if (prevIsBroken && line.length < 80) {
      const next = contentLines[ci + 1];
      // A short line without ordinary prose words, wedged before a lowercase
      // continuation, is treated as an artifact and skipped.
      if (next !== undefined && STARTS_LOWERCASE.test(next) && !CONTENT_WORDS.test(line)) continue;
    }

    const lcContinuation = prevIsBroken && STARTS_LOWERCASE.test(line);
    const phraseContinuation = hasPrev && prev.length > 15 && CONTINUATION_RE.test(line);
    const brokenMerge = prevIsBroken && line.length > 40 && !HEADING_START.test(line);

    if (lcContinuation || phraseContinuation || brokenMerge) {
      merged[merged.length - 1] = `${prev} ${line}`;
    } else {
      merged.push(line);
    }
  }
  return merged;
}

/** Drops blocks under 5 chars and stops once MAX_BLOCKS or MAX_WORDS is reached. */
function capBlocks(merged: readonly string[]): string[] {
  const blocks: string[] = [];
  let words = 0;
  for (const line of merged) {
    if (line.length < 5) continue;
    blocks.push(line);
    words += line.split(/\s+/).length;
    if (blocks.length >= MAX_BLOCKS || words >= MAX_WORDS) break;
  }
  return blocks;
}

/**
 * Extracts the body of "Item 1C" from already HTML-stripped filing text.
 *
 * Steps: normalize lines and repair split headings → locate the last Item 1C
 * heading → cut at the next section heading → strip page artifacts → merge
 * broken sentences → cap the amount of text.
 *
 * @param text Plain text with newline-separated lines.
 * @returns The section's blocks joined by blank lines, or `null` when no
 *          Item 1C heading is found or nothing survives the filtering.
 */
function extractItem1C(text: string): string | null {
  const lines = normalizeLines(text);

  const startIdx = findSectionStart(lines);
  if (startIdx === -1) return null;
  const endIdx = findSectionEnd(lines, startIdx);

  const contentLines = stripPageArtifacts(lines.slice(startIdx, endIdx));
  const blocks = capBlocks(mergeContinuations(contentLines));

  return blocks.length > 0 ? blocks.join("\n\n") : null;
}
|
||
|
||
// ─── Main ───
//
// Reads every cached `*.html` filing, extracts Item 1C, segments it into
// paragraphs and writes all paragraphs to OUTPUT_PATH as JSONL. Progress and
// the final summary go to stderr.

/** One entry of the accession → metadata map stored at ACCESSION_META_PATH. */
type AccessionMeta = {
  companyName: string;
  cik: string;
  ticker: string;
  filingDate: string;
  fiscalYear: number;
};

/** Extension of cached filings; the file name without it is used as the accession key. */
const HTML_SUFFIX = ".html";

/** A progress line is written after every this many files. */
const PROGRESS_EVERY = 500;

const start = Date.now();

process.stderr.write(" Loading accession metadata...\n");
// NOTE: the parsed JSON is trusted as-is; its shape is not validated at runtime.
const accMeta: Record<string, AccessionMeta> = JSON.parse(readFileSync(ACCESSION_META_PATH, "utf-8"));
process.stderr.write(` ${Object.keys(accMeta).length} entries\n`);

const htmlFiles = readdirSync(HTML_CACHE_DIR).filter((f) => f.endsWith(HTML_SUFFIX)).sort();
process.stderr.write(` ${htmlFiles.length} HTML files\n\n`);

const output: string[] = [];
let processed = 0;
let skipped = 0;

/**
 * Parses one cached filing into serialized paragraph lines.
 *
 * @returns One JSON string per paragraph, or `null` when the filing has no
 *          metadata entry or no Item 1C section could be extracted.
 */
function parseFiling(file: string): string[] | null {
  // Strip only the trailing extension; `replace(".html", "")` would remove the
  // first occurrence anywhere in the name.
  const accession = file.slice(0, -HTML_SUFFIX.length);
  const meta = accMeta[accession];
  if (!meta) return null;

  const html = readFileSync(`${HTML_CACHE_DIR}/${file}`, "utf-8");
  const section = extractItem1C(stripHtml(html));
  if (!section) return null;

  const filingMeta: FilingMeta = {
    companyName: meta.companyName,
    cik: meta.cik,
    ticker: meta.ticker,
    filingType: "10-K",
    filingDate: meta.filingDate,
    fiscalYear: meta.fiscalYear,
    accessionNumber: accession,
    secItem: "Item 1C",
  };

  const serialized: string[] = [];
  for (const p of segmentParagraphs(section, filingMeta)) serialized.push(JSON.stringify(p));
  return serialized;
}

for (const [index, file] of htmlFiles.entries()) {
  const serialized = parseFiling(file);
  if (serialized === null) {
    skipped++;
  } else {
    for (const entry of serialized) output.push(entry);
    processed++;
  }

  // Report on every PROGRESS_EVERY-th file, including when that file was skipped.
  const seen = index + 1;
  if (seen % PROGRESS_EVERY === 0) {
    const elapsedSec = (Date.now() - start) / 1000;
    // Guard against a zero elapsed time, which would print "Infinity"/"NaN".
    const rate = (elapsedSec > 0 ? (processed / elapsedSec) * 60 : 0).toFixed(0);
    process.stderr.write(
      `\x1b[2K\r ${seen}/${htmlFiles.length} │ ${processed} parsed │ ${output.length} paras │ ${skipped} skip │ ${rate}/min`,
    );
  }
}

process.stderr.write("\n Writing...\n");
// Newline-terminated JSONL; an empty result produces an empty file rather than a lone blank line.
writeFileSync(OUTPUT_PATH, output.length > 0 ? output.join("\n") + "\n" : "");

const elapsed = ((Date.now() - start) / 1000).toFixed(1);
process.stderr.write(
  `\n ✓ Done in ${elapsed}s: ${processed} filings → ${output.length} paragraphs (${skipped} skipped)\n`,
);
|