/**
|
|
* Expanded orphan word patch: recover dropped leading words for all
|
|
* paragraphs that start with lowercase (non-list patterns).
|
|
*
|
|
* For each candidate paragraph:
|
|
* 1. Read the source HTML for the filing
|
|
* 2. Strip HTML to plain text
|
|
* 3. Find the paragraph text in the stripped output
|
|
* 4. Look backwards to find the orphaned word on its own line
|
|
* 5. Validate: orphaned word must be short (1-3 words), start with uppercase
|
|
* 6. Output patch record
|
|
*
|
|
* Usage: bun run ts/scripts/patch-orphan-words.ts
|
|
* Input: data/paragraphs/paragraphs-clean.jsonl
|
|
* Output: data/paragraphs/patches/orphan-word-patches.jsonl
|
|
*/
|
|
import { readFileSync, writeFileSync, mkdirSync, existsSync } from "node:fs";
|
|
import { stripHtml } from "../src/extract/html-cleaner.ts";
|
|
|
|
// Input: cleaned paragraph records, one JSON object per line.
const PARAGRAPHS_PATH = "data/paragraphs/paragraphs-clean.jsonl";
// Source filing HTML, one file per accession number (<accession>.html).
const HTML_DIR = "data/raw/html";
// Output: orphan-word patch records, one JSON object per line.
const OUTPUT_PATH = "data/paragraphs/patches/orphan-word-patches.jsonl";

// List patterns to exclude (legitimate lowercase starts): a paragraph opening
// with one of these is a list continuation, not a dropped leading word.
const LIST_PATTERNS = /^(and |or |including |such as |as well as |along with |that |which |where |whether |as described |for example|for more |pursuant to |in addition )/i;
|
|
|
|
/** A paragraph record as read from the cleaned JSONL input. */
interface Paragraph {
  /** Stable identifier, carried through to the patch record. */
  id: string;
  /** Paragraph body text; candidates here start with a lowercase letter. */
  text: string;
  textHash: string;
  wordCount: number;
  /** Position of this paragraph within its filing. */
  paragraphIndex: number;
  /** Filing metadata; accessionNumber locates the source HTML file. */
  filing: {
    accessionNumber: string;
    companyName: string;
    [key: string]: unknown;
  };
}
|
|
|
|
/** One recovered orphan word, emitted as a JSONL line to OUTPUT_PATH. */
interface PatchRecord {
  /** id of the paragraph being patched. */
  id: string;
  /** Accession number of the filing the paragraph came from. */
  accession: string;
  paragraphIndex: number;
  /** The recovered leading word(s) — 1 to 3 words, starts uppercase. */
  orphanWord: string;
  /** First 60 chars of the paragraph as it currently stands. */
  originalStart: string;
  /** orphanWord + " " + originalStart: preview of the patched text. */
  patchedStart: string;
  /** Recovery method; this script always writes "html-lookback". */
  method: string;
}
|
|
|
|
// Cache stripped HTML per filing
|
|
const strippedCache = new Map<string, string>();
|
|
|
|
function getStrippedHtml(accession: string): string | null {
|
|
if (strippedCache.has(accession)) return strippedCache.get(accession)!;
|
|
|
|
const htmlPath = `${HTML_DIR}/${accession}.html`;
|
|
if (!existsSync(htmlPath)) return null;
|
|
|
|
const html = readFileSync(htmlPath, "utf-8");
|
|
const stripped = stripHtml(html);
|
|
strippedCache.set(accession, stripped);
|
|
return stripped;
|
|
}
|
|
|
|
function findOrphanWord(stripped: string, paragraphText: string): string | null {
|
|
// Use first 80 chars to search — avoids paragraph-end differences
|
|
const searchText = paragraphText.substring(0, Math.min(80, paragraphText.length));
|
|
const idx = stripped.indexOf(searchText);
|
|
if (idx === -1) return null;
|
|
|
|
// Look backwards to find the orphaned word
|
|
const before = stripped.substring(Math.max(0, idx - 200), idx);
|
|
const lines = before.split("\n");
|
|
const candidates = lines.filter((l) => l.trim().length > 0);
|
|
if (candidates.length === 0) return null;
|
|
|
|
const lastLine = candidates[candidates.length - 1]!.trim();
|
|
|
|
// Validate: short (1-3 words), starts with uppercase
|
|
const words = lastLine.split(/\s+/);
|
|
if (words.length > 3 || words.length === 0) return null;
|
|
if (!/^[A-Z]/.test(words[0]!)) return null;
|
|
|
|
// Reject all-caps headings (>15 chars)
|
|
if (lastLine === lastLine.toUpperCase() && lastLine.length > 15) return null;
|
|
|
|
// Reject section/item references and page artifacts
|
|
if (/^(item|part|section)\s/i.test(lastLine)) return null;
|
|
if (/^\d+[\.\)]/.test(lastLine)) return null;
|
|
if (/^table of contents$/i.test(lastLine)) return null;
|
|
|
|
return lastLine;
|
|
}
|
|
|
|
// ─── Main ───
|
|
|
|
const start = Date.now();
|
|
mkdirSync("data/paragraphs/patches", { recursive: true });
|
|
|
|
process.stderr.write(" Loading paragraphs...\n");
|
|
const paragraphs: Paragraph[] = [];
|
|
for (const line of readFileSync(PARAGRAPHS_PATH, "utf-8").split("\n")) {
|
|
if (line.trim()) paragraphs.push(JSON.parse(line));
|
|
}
|
|
process.stderr.write(` ${paragraphs.length} paragraphs loaded\n`);
|
|
|
|
// Find candidates
|
|
const candidateParas = paragraphs.filter((p) => {
|
|
if (!p.text || p.text.length === 0) return false;
|
|
if (!/^[a-z]/.test(p.text)) return false;
|
|
if (LIST_PATTERNS.test(p.text)) return false;
|
|
return true;
|
|
});
|
|
process.stderr.write(` ${candidateParas.length} orphan word candidates\n\n`);
|
|
|
|
// Process
|
|
const patches: PatchRecord[] = [];
|
|
let notFound = 0;
|
|
let noOrphan = 0;
|
|
let lastAcc = "";
|
|
|
|
for (let i = 0; i < candidateParas.length; i++) {
|
|
const p = candidateParas[i]!;
|
|
const acc = p.filing.accessionNumber;
|
|
|
|
if (acc !== lastAcc) {
|
|
if (strippedCache.size > 20) strippedCache.clear();
|
|
lastAcc = acc;
|
|
}
|
|
|
|
const stripped = getStrippedHtml(acc);
|
|
if (!stripped) { notFound++; continue; }
|
|
|
|
const orphan = findOrphanWord(stripped, p.text);
|
|
if (!orphan) { noOrphan++; continue; }
|
|
|
|
patches.push({
|
|
id: p.id,
|
|
accession: acc,
|
|
paragraphIndex: p.paragraphIndex,
|
|
orphanWord: orphan,
|
|
originalStart: p.text.substring(0, 60),
|
|
patchedStart: orphan + " " + p.text.substring(0, 60),
|
|
method: "html-lookback",
|
|
});
|
|
|
|
if ((i + 1) % 200 === 0) {
|
|
process.stderr.write(
|
|
`\x1b[2K\r ${i + 1}/${candidateParas.length} | ${patches.length} patched | ${noOrphan} no orphan | ${notFound} no HTML`,
|
|
);
|
|
}
|
|
}
|
|
|
|
writeFileSync(OUTPUT_PATH, patches.map((p) => JSON.stringify(p)).join("\n") + "\n");
|
|
|
|
const elapsed = ((Date.now() - start) / 1000).toFixed(1);
|
|
process.stderr.write(
|
|
`\n\n Done in ${elapsed}s\n` +
|
|
` ${candidateParas.length} candidates → ${patches.length} patches found\n` +
|
|
` ${noOrphan} candidates: no orphan word found in HTML\n` +
|
|
` ${notFound} candidates: HTML file not found\n` +
|
|
` Output: ${OUTPUT_PATH}\n`,
|
|
);
|
|
|
|
// Word frequency summary
|
|
const wordCounts = new Map<string, number>();
|
|
for (const p of patches) {
|
|
wordCounts.set(p.orphanWord, (wordCounts.get(p.orphanWord) ?? 0) + 1);
|
|
}
|
|
const sorted = [...wordCounts.entries()].sort((a, b) => b[1] - a[1]);
|
|
process.stderr.write("\n Top orphan words:\n");
|
|
for (const [word, count] of sorted.slice(0, 15)) {
|
|
process.stderr.write(` ${word}: ${count}\n`);
|
|
}
|