/**
 * Expanded orphan word patch: recover dropped leading words for all
 * paragraphs that start with lowercase (non-list patterns).
 *
 * For each candidate paragraph:
 *   1. Read the source HTML for the filing
 *   2. Strip HTML to plain text
 *   3. Find the paragraph text in the stripped output
 *   4. Look backwards to find the orphaned word on its own line
 *   5. Validate: orphaned word must be short (1-3 words), start with uppercase
 *   6. Output patch record
 *
 * Usage:  bun run ts/scripts/patch-orphan-words.ts
 * Input:  data/paragraphs/paragraphs-clean.jsonl
 * Output: data/paragraphs/patches/orphan-word-patches.jsonl
 */
import { readFileSync, writeFileSync, mkdirSync, existsSync } from "node:fs";
import { stripHtml } from "../src/extract/html-cleaner.ts";

const PARAGRAPHS_PATH = "data/paragraphs/paragraphs-clean.jsonl";
const HTML_DIR = "data/raw/html";
const OUTPUT_DIR = "data/paragraphs/patches";
const OUTPUT_PATH = `${OUTPUT_DIR}/orphan-word-patches.jsonl`;

/** Number of leading paragraph characters used to locate it in the stripped HTML. */
const SEARCH_PREFIX_LENGTH = 80;
/** How far back (in characters) from the match to look for the orphaned line. */
const LOOKBACK_CHARS = 200;
/** The stripped-HTML cache is emptied once it holds more than this many filings. */
const MAX_CACHED_FILINGS = 20;
/** Emit a progress line every this many candidates. */
const PROGRESS_INTERVAL = 200;

// List patterns to exclude (legitimate lowercase starts)
const LIST_PATTERNS =
  /^(and |or |including |such as |as well as |along with |that |which |where |whether |as described |for example|for more |pursuant to |in addition )/i;

interface Paragraph {
  id: string;
  text: string;
  textHash: string;
  wordCount: number;
  paragraphIndex: number;
  filing: {
    accessionNumber: string;
    companyName: string;
    [key: string]: unknown;
  };
}

interface PatchRecord {
  id: string;
  accession: string;
  paragraphIndex: number;
  orphanWord: string;
  originalStart: string;
  patchedStart: string;
  method: string;
}

// Cache stripped HTML per filing. A `null` entry records that the HTML file
// does not exist, so the filesystem is not re-checked for every paragraph of
// the same filing.
const strippedCache = new Map<string, string | null>();

/**
 * Returns the plain-text rendering of a filing's HTML, or `null` when
 * `${HTML_DIR}/${accession}.html` does not exist. Results (including misses)
 * are memoised in `strippedCache`.
 */
function getStrippedHtml(accession: string): string | null {
  const cached = strippedCache.get(accession);
  // Compare against undefined rather than truthiness: "" and null are both
  // legitimate cached values.
  if (cached !== undefined) return cached;

  const htmlPath = `${HTML_DIR}/${accession}.html`;
  const stripped = existsSync(htmlPath) ? stripHtml(readFileSync(htmlPath, "utf-8")) : null;
  strippedCache.set(accession, stripped);
  return stripped;
}

/**
 * Looks for a short, capitalised line immediately preceding the paragraph in
 * the stripped filing text.
 *
 * @param stripped      Plain text of the whole filing.
 * @param paragraphText Text of the paragraph whose leading word(s) may be missing.
 * @returns The trimmed preceding line when it passes validation, otherwise `null`
 *          (paragraph not found, no preceding text in range, or the line looks
 *          like a heading / reference / sentence rather than an orphaned word).
 */
function findOrphanWord(stripped: string, paragraphText: string): string | null {
  // Use only the paragraph's leading characters to search — avoids paragraph-end differences
  const searchText = paragraphText.substring(0, SEARCH_PREFIX_LENGTH);
  const idx = stripped.indexOf(searchText);
  if (idx === -1) return null;

  // Look backwards to find the orphaned word
  const windowStart = Math.max(0, idx - LOOKBACK_CHARS);
  const lines = stripped.substring(windowStart, idx).split("\n");

  // Index of the last non-blank line inside the lookback window.
  let lineIndex = lines.length - 1;
  while (lineIndex >= 0 && lines[lineIndex]!.trim().length === 0) lineIndex--;
  if (lineIndex < 0) return null;

  let rawLine = lines[lineIndex]!;
  if (lineIndex === 0 && windowStart > 0) {
    // The window may have cut this line in half. Extend it back to its real
    // start so the tail of a long sentence is not mistaken for a short line.
    const lineStart = stripped.lastIndexOf("\n", windowStart - 1) + 1;
    rawLine = stripped.substring(lineStart, windowStart) + rawLine;
  }
  const lastLine = rawLine.trim();

  // Validate: short (1-3 words), starts with uppercase
  const words = lastLine.split(/\s+/);
  if (words.length > 3) return null;
  if (!/^[A-Z]/.test(words[0]!)) return null;

  // Reject all-caps headings (>15 chars)
  if (lastLine === lastLine.toUpperCase() && lastLine.length > 15) return null;

  // Reject section/item references and page artifacts
  if (/^(item|part|section)\s/i.test(lastLine)) return null;
  if (/^\d+[\.\)]/.test(lastLine)) return null;
  if (/^table of contents$/i.test(lastLine)) return null;

  return lastLine;
}

/**
 * Reads a JSONL file into memory, skipping blank lines.
 *
 * The parsed objects are trusted to match `Paragraph`; no schema validation
 * is performed.
 *
 * @throws Error naming the file and 1-based line number when a line is not valid JSON.
 */
function loadParagraphs(path: string): Paragraph[] {
  const loaded: Paragraph[] = [];
  const lines = readFileSync(path, "utf-8").split("\n");
  for (const [lineIndex, line] of lines.entries()) {
    if (!line.trim()) continue;
    try {
      loaded.push(JSON.parse(line) as Paragraph);
    } catch (e: unknown) {
      const reason = e instanceof Error ? e.message : String(e);
      throw new Error(`${path}:${lineIndex + 1}: invalid JSON (${reason})`);
    }
  }
  return loaded;
}

// ─── Main ───

const start = Date.now();
mkdirSync(OUTPUT_DIR, { recursive: true });

process.stderr.write(" Loading paragraphs...\n");
const paragraphs = loadParagraphs(PARAGRAPHS_PATH);
process.stderr.write(` ${paragraphs.length} paragraphs loaded\n`);

// Find candidates: non-empty text that starts with a lowercase letter and is
// not one of the legitimate lowercase continuations in LIST_PATTERNS.
const candidateParas = paragraphs.filter(
  (p) => Boolean(p.text) && /^[a-z]/.test(p.text) && !LIST_PATTERNS.test(p.text),
);
process.stderr.write(` ${candidateParas.length} orphan word candidates\n\n`);

// Process
const patches: PatchRecord[] = [];
let notFound = 0;
let noOrphan = 0;
let lastAcc = "";

for (const [i, p] of candidateParas.entries()) {
  const acc = p.filing.accessionNumber;

  // Bound memory: when moving on to another filing, drop the cache once it
  // has grown past the limit.
  if (acc !== lastAcc) {
    if (strippedCache.size > MAX_CACHED_FILINGS) strippedCache.clear();
    lastAcc = acc;
  }

  const stripped = getStrippedHtml(acc);
  if (stripped === null) {
    notFound++;
  } else {
    const orphan = findOrphanWord(stripped, p.text);
    if (orphan === null) {
      noOrphan++;
    } else {
      const originalStart = p.text.substring(0, 60);
      patches.push({
        id: p.id,
        accession: acc,
        paragraphIndex: p.paragraphIndex,
        orphanWord: orphan,
        originalStart,
        patchedStart: orphan + " " + originalStart,
        method: "html-lookback",
      });
    }
  }

  // Reported for every candidate, including those that produced no patch.
  if ((i + 1) % PROGRESS_INTERVAL === 0) {
    process.stderr.write(
      `\x1b[2K\r ${i + 1}/${candidateParas.length} | ${patches.length} patched | ${noOrphan} no orphan | ${notFound} no HTML`,
    );
  }
}

// One JSON object per line, each newline-terminated; no patches → empty file.
writeFileSync(OUTPUT_PATH, patches.map((p) => JSON.stringify(p) + "\n").join(""));

const elapsed = ((Date.now() - start) / 1000).toFixed(1);
process.stderr.write(
  `\n\n Done in ${elapsed}s\n` +
    ` ${candidateParas.length} candidates → ${patches.length} patches found\n` +
    ` ${noOrphan} candidates: no orphan word found in HTML\n` +
    ` ${notFound} candidates: HTML file not found\n` +
    ` Output: ${OUTPUT_PATH}\n`,
);

// Word frequency summary
const wordCounts = new Map<string, number>();
for (const p of patches) {
  wordCounts.set(p.orphanWord, (wordCounts.get(p.orphanWord) ?? 0) + 1);
}
const sorted = [...wordCounts.entries()].sort((a, b) => b[1] - a[1]);
process.stderr.write("\n Top orphan words:\n");
for (const [word, count] of sorted.slice(0, 15)) {
  process.stderr.write(` ${word}: ${count}\n`);
}