/**
 * Fast reparse of cached HTML files. No network, no cheerio, no large lookups.
 * Reads pre-built accession→metadata map and uses regex for HTML stripping.
 *
 * For every cached `<accession>.html` file that has an entry in the metadata
 * map, the "Item 1C" section is extracted from the stripped text, segmented
 * into paragraphs, and written as one JSON document per line to OUTPUT_PATH.
 * The paths below are relative, so they resolve against the current working
 * directory of the process.
 */
import { readdirSync, readFileSync, writeFileSync } from "node:fs";
import { segmentParagraphs } from "./segment.ts";
import { stripHtml } from "./html-cleaner.ts";
import type { FilingMeta } from "@sec-cybert/schemas/paragraph.ts";

const HTML_CACHE_DIR = "../data/raw/html";
const OUTPUT_PATH = "../data/paragraphs/paragraphs.jsonl";
const ACCESSION_META_PATH = "../data/bulk/accession-meta.json";
const HTML_EXT = ".html";
/** A progress line is written to stderr every this many files. */
const PROGRESS_INTERVAL = 500;

/** Per-filing fields copied from the accession metadata map into each FilingMeta. */
type AccessionMeta = Pick<FilingMeta, "companyName" | "cik" | "ticker" | "filingDate" | "fiscalYear">;

// ─── Item 1C extraction (regex on stripped text) ───

const ITEM_1C = /^\s*(\u2022\s*)?item\s*1c[\.\s\u00a0—–:-]/i;
const NEXT_ITEM = /^\s*items?\s*(\d+[a-z]?\.?\d*)/i;
// NEXT_ITEM's capture may include a trailing period ("1C."), so the "is this
// still Item 1C?" check has to tolerate it.
const ITEM_1C_NUMBER = /^1c\.?$/i;
// The apostrophe class accepts both the ASCII and the typographic (U+2019) form.
const SECTION_END = /^\s*(signatures?|part\s*(ii|2|iii|3|iv|4)|exhibit\s*index|financial\s+statements|management['\u2019]?s?\s+financial\s+discussion)/i;
// Must be ALL-CAPS and short (< 120 chars) to avoid matching body text
const SUBSIDIARY_HEADER = /^[A-Z][A-Z\s,.'&-]{5,}(?:LLC|INC|CORP|COMPANY|L\.?P\.?)\b.*\bAND\s+SUBSIDIARIES\b/;
const POST_1C_SECTION = /^\s*(properties|legal\s+proceedings|mine\s+safety|market\s+for\s+registrant|selected\s+financial|management.s\s+discussion|equity\s+compensation\s+plan|stock\s+performance)/i;
const SHORT_SECTION_HEADING = /^\s*(risk\s+factors|controls\s+and\s+procedures|unresolved\s+staff\s+comments|glossary[\s\w]*|employees?\s+and\s+(consultants|human)|subsidiaries|executive\s+officers)\s*$/i;
const CONTINUATION_RE = /^(and |or |including |such as |along with |that |which |where |whether |as well as |as described |for example|for more |pursuant to |in addition )/i;

// Heading repair
const BARE_ITEM = /^\s*item\s*$/i;
const LEADING_DIGIT = /^\s*\d/;

// Page artifacts (headers, footers, page numbers, running titles)
const PAGE_NUMBER = /^[-–—\s]*[A-Za-z]?[-–—]?\s*\d+[-–—\s]*$/;
const PAGE_LABEL = /^page\s+\d+$/i;
const TOC_ONLY = /^table of conten\s*t?s?[\s\/]*$/i;
const PART_ONLY = /^part\s+[iv]+$/i;
const BACK_TO_LINK = /^\(?\s*back\s+to\s+(index|top|toc)\s*\)?$/i;
const INDEX_ONLY = /^index$/i;
const FORM_10K_MENTION = /form\s+10-[kq]/i;
const YEAR_FORM_TITLE = /^\d{4}\s+(form|annual)/i;
const CAPS_ONLY = /^[A-Z#@\s\d]+$/;
const CAPS_KEEP_WORDS = /CYBER|SECURITY|RISK|BOARD|INCIDENT/i;
const PIPE_KEEP_WORDS = /cyber|security|incident|threat/i;
const TOC_PREFIX = /^\d*\s*table\s+of\s+contents\s+/i;
const SENTENCE_PUNCT = /[.!?]/;

// Continuation merging
const SENTENCE_END = /[.!?:;\")\u201d]\s*$/;
const LOWERCASE_START = /^[a-z]/;
const CONTENT_WORDS = /\b(we|our|the|is|are|has|have|its|this|each|all|any|not|may|can|will|such|including|cybersecurity|security|risk|board|management|incident|threat|assess|monitor|oversee|protect|comply)\b/i;
const HEADING_START = /^\s*(item|part|signatures?|exhibit|properties|legal\s+proceedings)/i;

// Only ever passed to String.prototype.replace, which resets lastIndex itself.
const WHITESPACE_RUNS = /\s+/g;
const WORD_SEPARATOR = /\s+/;

/** Upper bounds on the size of one extracted section. */
const MAX_BLOCKS = 50;
const MAX_WORDS = 15000;

/** Splits text into lines, collapses whitespace runs to one space, trims, and drops empty lines. */
function normalizeLines(text: string): string[] {
  return text
    .split("\n")
    .map((raw) => raw.replace(WHITESPACE_RUNS, " ").trim())
    .filter((line) => line.length > 0);
}

/** Rejoins broken headings: "ITEM\n2. PROPERTIES" → "ITEM 2. PROPERTIES". */
function rejoinBrokenHeadings(rawLines: readonly string[]): string[] {
  const joined: string[] = [];
  let i = 0;
  while (i < rawLines.length) {
    const line = rawLines[i] ?? "";
    const next = rawLines[i + 1];
    if (next !== undefined && BARE_ITEM.test(line) && LEADING_DIGIT.test(next)) {
      joined.push(`${line} ${next}`);
      i += 2; // the following line was consumed by the join
    } else {
      joined.push(line);
      i += 1;
    }
  }
  return joined;
}

/**
 * Returns the index of the line after the LAST short (< 300 chars) Item 1C
 * heading, or -1 when there is none.
 * NOTE(review): taking the last match presumably skips earlier mentions such
 * as a table-of-contents entry — confirm against real filings.
 */
function findSectionStart(lines: readonly string[]): number {
  for (let i = lines.length - 1; i >= 0; i--) {
    const line = lines[i] ?? "";
    if (line.length < 300 && ITEM_1C.test(line)) return i + 1;
  }
  return -1;
}

/**
 * Returns the index of the first line at or after `startIdx` that looks like
 * the heading of a following section, or `lines.length` when none is found.
 * Lines longer than 300 characters are never treated as headings.
 */
function findSectionEnd(lines: readonly string[], startIdx: number): number {
  for (let i = startIdx; i < lines.length; i++) {
    const line = lines[i] ?? "";
    if (line.length > 300) continue;
    if (SECTION_END.test(line)) return i;
    if (line.length < 150 && POST_1C_SECTION.test(line)) return i;
    if (SHORT_SECTION_HEADING.test(line)) return i;
    const nextItem = NEXT_ITEM.exec(line);
    if (nextItem && line.length < 120 && !ITEM_1C_NUMBER.test(nextItem[1] ?? "")) return i;
  }
  return lines.length;
}

/** True when the whole line is a page artifact (page number, running title, nav link, ...). */
function isPageArtifact(line: string): boolean {
  const len = line.length;
  return (
    len < 3 ||
    PAGE_NUMBER.test(line) ||
    PAGE_LABEL.test(line) ||
    TOC_ONLY.test(line) ||
    (len < 15 && PART_ONLY.test(line)) ||
    BACK_TO_LINK.test(line) ||
    INDEX_ONLY.test(line) ||
    (len < 120 && FORM_10K_MENTION.test(line)) ||
    (len < 40 && YEAR_FORM_TITLE.test(line)) ||
    // Short all-caps lines are dropped unless they mention a topic keyword.
    (len < 50 && CAPS_ONLY.test(line) && !CAPS_KEEP_WORDS.test(line)) ||
    (len < 120 && SUBSIDIARY_HEADER.test(line)) ||
    // Short lines containing "|" are dropped unless they mention a topic keyword.
    (len < 80 && line.includes("|") && !PIPE_KEEP_WORDS.test(line))
  );
}

/**
 * First pass: strips page artifacts (headers, footers, page numbers, running titles).
 * A leading "Table of Contents" prefix is removed from a line; what follows is
 * kept only when it is at least 5 characters long and contains sentence punctuation.
 */
function stripPageArtifacts(sectionLines: readonly string[]): string[] {
  const kept: string[] = [];
  for (const line of sectionLines) {
    if (isPageArtifact(line)) continue;
    if (TOC_PREFIX.test(line)) {
      const remainder = line.replace(TOC_PREFIX, "").trim();
      if (remainder.length >= 5 && SENTENCE_PUNCT.test(remainder)) kept.push(remainder);
      continue;
    }
    kept.push(line);
  }
  return kept;
}

/** Second pass: merges continuation lines and skips artifacts between broken sentences. */
function mergeContinuations(contentLines: readonly string[]): string[] {
  const merged: string[] = [];
  for (const [idx, line] of contentLines.entries()) {
    const hasPrev = merged.length > 0;
    const prev = merged[merged.length - 1] ?? "";
    // "Broken" = a reasonably long previous block that does not end in closing punctuation.
    const prevIsBroken = hasPrev && prev.length > 15 && !SENTENCE_END.test(prev);

    // A short line wedged between a broken sentence and its lowercase
    // continuation is dropped unless it contains ordinary content words.
    const next = contentLines[idx + 1];
    const wedged = prevIsBroken && line.length < 80 && next !== undefined && LOWERCASE_START.test(next);
    if (wedged && !CONTENT_WORDS.test(line)) continue;

    const lcContinuation = prevIsBroken && LOWERCASE_START.test(line);
    const phraseContinuation = hasPrev && prev.length > 15 && CONTINUATION_RE.test(line);
    const brokenMerge = prevIsBroken && line.length > 40 && !HEADING_START.test(line);

    if (lcContinuation || phraseContinuation || brokenMerge) {
      merged[merged.length - 1] = `${prev} ${line}`;
    } else {
      merged.push(line);
    }
  }
  return merged;
}

/**
 * Extracts the body of the "Item 1C" section from already-stripped filing text.
 *
 * @param text Plain text of a filing (HTML already removed).
 * @returns The section as blocks joined by blank lines, limited to MAX_BLOCKS
 *   blocks / MAX_WORDS words, or `null` when no Item 1C heading is found or
 *   nothing usable remains after cleaning.
 */
function extractItem1C(text: string): string | null {
  const lines = rejoinBrokenHeadings(normalizeLines(text));

  const startIdx = findSectionStart(lines);
  if (startIdx === -1) return null;
  const endIdx = findSectionEnd(lines, startIdx);

  const merged = mergeContinuations(stripPageArtifacts(lines.slice(startIdx, endIdx)));

  const blocks: string[] = [];
  let words = 0;
  for (const block of merged) {
    if (block.length < 5) continue;
    blocks.push(block);
    words += block.split(WORD_SEPARATOR).length;
    if (blocks.length >= MAX_BLOCKS || words >= MAX_WORDS) break;
  }
  return blocks.length > 0 ? blocks.join("\n\n") : null;
}

// ─── Main ───

/**
 * Loads the accession → metadata map from ACCESSION_META_PATH.
 *
 * @throws Error when the file does not contain a JSON object.
 */
function loadAccessionMeta(): Record<string, AccessionMeta> {
  const parsed: unknown = JSON.parse(readFileSync(ACCESSION_META_PATH, "utf-8"));
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    throw new Error(`${ACCESSION_META_PATH} must contain a JSON object keyed by accession number`);
  }
  // Only the top-level shape is checked; individual entries are trusted as-is.
  return parsed as Record<string, AccessionMeta>;
}

/**
 * Parses one cached HTML file into serialized paragraph records.
 *
 * @param file File name inside HTML_CACHE_DIR; must end with ".html".
 * @param accMeta Accession → metadata map.
 * @returns One JSON string per paragraph, or `null` when the filing has no
 *   metadata entry or no Item 1C section could be extracted.
 */
function parseFiling(file: string, accMeta: Readonly<Record<string, AccessionMeta>>): string[] | null {
  // Strip exactly the suffix; String.replace would hit the first ".html" anywhere in the name.
  const accession = file.slice(0, -HTML_EXT.length);
  const meta = accMeta[accession];
  if (!meta) return null;

  const html = readFileSync(`${HTML_CACHE_DIR}/${file}`, "utf-8");
  const section = extractItem1C(stripHtml(html));
  if (!section) return null;

  const filingMeta: FilingMeta = {
    companyName: meta.companyName,
    cik: meta.cik,
    ticker: meta.ticker,
    filingType: "10-K",
    filingDate: meta.filingDate,
    fiscalYear: meta.fiscalYear,
    accessionNumber: accession,
    secItem: "Item 1C",
  };

  const serialized: string[] = [];
  for (const paragraph of segmentParagraphs(section, filingMeta)) {
    serialized.push(JSON.stringify(paragraph));
  }
  return serialized;
}

const start = Date.now();

process.stderr.write(" Loading accession metadata...\n");
const accMeta = loadAccessionMeta();
process.stderr.write(` ${Object.keys(accMeta).length} entries\n`);

const htmlFiles = readdirSync(HTML_CACHE_DIR)
  .filter((f) => f.endsWith(HTML_EXT))
  .sort();
process.stderr.write(` ${htmlFiles.length} HTML files\n\n`);

const output: string[] = [];
let processed = 0;
let skipped = 0;

for (const [index, file] of htmlFiles.entries()) {
  const paragraphs = parseFiling(file, accMeta);
  if (paragraphs === null) {
    skipped++;
  } else {
    for (const record of paragraphs) output.push(record);
    processed++;
  }

  // Reported for every interval, including when the current file was skipped.
  if ((index + 1) % PROGRESS_INTERVAL === 0) {
    const elapsedSec = (Date.now() - start) / 1000;
    const rate = elapsedSec > 0 ? ((processed / elapsedSec) * 60).toFixed(0) : "0";
    process.stderr.write(
      `\x1b[2K\r ${index + 1}/${htmlFiles.length} │ ${processed} parsed │ ${output.length} paras │ ${skipped} skip │ ${rate}/min`,
    );
  }
}

process.stderr.write("\n Writing...\n");
// An empty result yields an empty file rather than a single blank JSONL line.
writeFileSync(OUTPUT_PATH, output.length > 0 ? output.join("\n") + "\n" : "");

const elapsed = ((Date.now() - start) / 1000).toFixed(1);
process.stderr.write(
  `\n ✓ Done in ${elapsed}s: ${processed} filings → ${output.length} paragraphs (${skipped} skipped)\n`,
);