SEC-cyBERT/ts/scripts/patch-orphan-words.ts
2026-03-29 20:33:39 -04:00

175 lines
5.6 KiB
TypeScript

/**
* Expanded orphan word patch: recover dropped leading words for all
* paragraphs that start with lowercase (non-list patterns).
*
* For each candidate paragraph:
* 1. Read the source HTML for the filing
* 2. Strip HTML to plain text
* 3. Find the paragraph text in the stripped output
* 4. Look backwards to find the orphaned word on its own line
* 5. Validate: orphaned word must be short (1-3 words), start with uppercase
* 6. Output patch record
*
* Usage: bun run ts/scripts/patch-orphan-words.ts
* Input: data/paragraphs/paragraphs-clean.jsonl
* Output: data/paragraphs/patches/orphan-word-patches.jsonl
*/
import { readFileSync, writeFileSync, mkdirSync, existsSync } from "node:fs";
import { stripHtml } from "../src/extract/html-cleaner.ts";
const PARAGRAPHS_PATH = "data/paragraphs/paragraphs-clean.jsonl"; // input: one JSON paragraph record per line
const HTML_DIR = "data/raw/html"; // source filings, looked up as `${HTML_DIR}/${accession}.html`
const OUTPUT_PATH = "data/paragraphs/patches/orphan-word-patches.jsonl"; // output: one PatchRecord per line
// List patterns to exclude (legitimate lowercase starts)
const LIST_PATTERNS = /^(and |or |including |such as |as well as |along with |that |which |where |whether |as described |for example|for more |pursuant to |in addition )/i;
/**
 * One record from paragraphs-clean.jsonl. Only the fields this script
 * touches are declared; any remaining filing metadata passes through
 * via the index signature.
 */
interface Paragraph {
id: string; // unique paragraph identifier, copied into the patch record
text: string; // paragraph body; candidates are those starting lowercase
textHash: string; // content hash (not used by this script)
wordCount: number; // word count (not used by this script)
paragraphIndex: number; // position of the paragraph within its filing
filing: {
accessionNumber: string; // keys the HTML file lookup in HTML_DIR
companyName: string; // not used by this script
[key: string]: unknown; // extra filing metadata, passed through untyped
};
}
/**
 * One line of the JSONL patch output: the recovered orphan word plus
 * enough context to review and apply the patch downstream.
 */
interface PatchRecord {
id: string; // id of the paragraph being patched
accession: string; // accession number of the filing it came from
paragraphIndex: number; // paragraph position within the filing
orphanWord: string; // short capitalized line recovered from the stripped HTML
originalStart: string; // first 60 chars of the unpatched paragraph
patchedStart: string; // orphanWord + " " + originalStart, for quick review
method: string; // always "html-lookback" in this script
}
// Memoized plain text per filing; the main loop evicts entries in bulk
// once the cache grows past its size bound.
const strippedCache = new Map<string, string>();

/**
 * Return the HTML-stripped plain text for a filing, reading and stripping
 * the source HTML on first access and serving the cache afterwards.
 * Returns null when no HTML file exists for the accession number.
 */
function getStrippedHtml(accession: string): string | null {
const cached = strippedCache.get(accession);
if (cached !== undefined) return cached;
const htmlPath = `${HTML_DIR}/${accession}.html`;
if (!existsSync(htmlPath)) return null;
const stripped = stripHtml(readFileSync(htmlPath, "utf-8"));
strippedCache.set(accession, stripped);
return stripped;
}
/**
 * Scan backwards from the paragraph's position in the stripped HTML and
 * return the short capitalized line immediately preceding it (the word the
 * extractor likely dropped), or null when no plausible orphan is found.
 */
function findOrphanWord(stripped: string, paragraphText: string): string | null {
// Match on the paragraph's first 80 chars so differences at the
// paragraph's end don't prevent a hit.
const needle = paragraphText.slice(0, 80);
const at = stripped.indexOf(needle);
if (at < 0) return null;

// Take up to 200 chars of context before the match; its last non-blank
// line is the orphan candidate.
const context = stripped.slice(Math.max(0, at - 200), at);
const nonBlank = context.split("\n").filter((line) => line.trim().length > 0);
if (nonBlank.length === 0) return null;
const candidate = nonBlank[nonBlank.length - 1]!.trim();

// Must be 1-3 words and begin with an uppercase letter.
const tokens = candidate.split(/\s+/);
if (tokens.length === 0 || tokens.length > 3) return null;
if (!/^[A-Z]/.test(tokens[0]!)) return null;
// Long all-caps lines are section headings, not dropped words.
if (candidate.length > 15 && candidate === candidate.toUpperCase()) return null;
// Skip SEC structural references and page artifacts.
if (/^(item|part|section)\s/i.test(candidate)) return null;
if (/^\d+[\.\)]/.test(candidate)) return null;
if (/^table of contents$/i.test(candidate)) return null;
return candidate;
}
// ─── Main ───
const start = Date.now();
mkdirSync("data/paragraphs/patches", { recursive: true });

process.stderr.write(" Loading paragraphs...\n");
const paragraphs: Paragraph[] = readFileSync(PARAGRAPHS_PATH, "utf-8")
.split("\n")
.filter((line) => line.trim())
.map((line) => JSON.parse(line) as Paragraph);
process.stderr.write(` ${paragraphs.length} paragraphs loaded\n`);

// A candidate is a non-empty paragraph that starts with a lowercase
// letter and does not match a known list-continuation pattern.
const candidateParas = paragraphs.filter(
(p) => Boolean(p.text) && /^[a-z]/.test(p.text) && !LIST_PATTERNS.test(p.text),
);
process.stderr.write(` ${candidateParas.length} orphan word candidates\n\n`);
// ─── Process candidates ───
const patches: PatchRecord[] = [];
let notFound = 0; // candidates whose filing HTML is missing on disk
let noOrphan = 0; // candidates with no plausible orphan word in the HTML
let lastAcc = "";
for (const [i, p] of candidateParas.entries()) {
const acc = p.filing.accessionNumber;
// Bound memory: when a new filing starts and the cache has grown past
// 20 entries, dump it wholesale.
if (acc !== lastAcc) {
if (strippedCache.size > 20) strippedCache.clear();
lastAcc = acc;
}
const stripped = getStrippedHtml(acc);
if (!stripped) {
notFound++;
continue;
}
const orphan = findOrphanWord(stripped, p.text);
if (!orphan) {
noOrphan++;
continue;
}
patches.push({
id: p.id,
accession: acc,
paragraphIndex: p.paragraphIndex,
orphanWord: orphan,
originalStart: p.text.substring(0, 60),
patchedStart: orphan + " " + p.text.substring(0, 60),
method: "html-lookback",
});
// Single-line progress report every 200 candidates.
if ((i + 1) % 200 === 0) {
process.stderr.write(
`\x1b[2K\r ${i + 1}/${candidateParas.length} | ${patches.length} patched | ${noOrphan} no orphan | ${notFound} no HTML`,
);
}
}
// ─── Write output and report ───
// Serialize as JSONL. Guard the empty case: previously an empty patch
// list produced a file containing a single blank line ("" + "\n"),
// which breaks naive line counts downstream; write a truly empty file
// instead.
const jsonl = patches.map((p) => JSON.stringify(p)).join("\n");
writeFileSync(OUTPUT_PATH, jsonl.length > 0 ? jsonl + "\n" : "");
const elapsed = ((Date.now() - start) / 1000).toFixed(1);
process.stderr.write(
`\n\n Done in ${elapsed}s\n` +
` ${candidateParas.length} candidates → ${patches.length} patches found\n` +
` ${noOrphan} candidates: no orphan word found in HTML\n` +
` ${notFound} candidates: HTML file not found\n` +
` Output: ${OUTPUT_PATH}\n`,
);

// Frequency summary of recovered words — a quick sanity check that the
// patches look like dropped sentence-leading words.
const wordCounts = new Map<string, number>();
for (const p of patches) {
wordCounts.set(p.orphanWord, (wordCounts.get(p.orphanWord) ?? 0) + 1);
}
const sorted = [...wordCounts.entries()].sort((a, b) => b[1] - a[1]);
process.stderr.write("\n Top orphan words:\n");
for (const [word, count] of sorted.slice(0, 15)) {
process.stderr.write(` ${word}: ${count}\n`);
}