/**
|
|
* Expanded orphan word patch: recover dropped leading words for all
|
|
* paragraphs that start with lowercase (non-list patterns).
|
|
*
|
|
* For each candidate paragraph:
|
|
* 1. Read the source HTML for the filing
|
|
* 2. Strip HTML to plain text
|
|
* 3. Find the paragraph text in the stripped output
|
|
* 4. Look backwards to find the orphaned word on its own line
|
|
* 5. Validate: orphaned word must be short (1-3 words), start with uppercase
|
|
* 6. Output patch record
|
|
*
|
|
* Usage: bun run ts/scripts/patch-orphan-words.ts
|
|
* Input: data/paragraphs/paragraphs-clean.jsonl
|
|
* Output: data/paragraphs/patches/orphan-word-patches.jsonl
|
|
*/
|
|
import { readFileSync, writeFileSync, mkdirSync, existsSync } from "node:fs";
|
|
import { stripHtml } from "../src/extract/html-cleaner.ts";
|
|
|
|
// Input: cleaned paragraph records, one JSON object per line.
const PARAGRAPHS_PATH = "data/paragraphs/paragraphs-clean.jsonl";
// Source filing HTML, one file per accession number (<accession>.html).
const HTML_DIR = "data/raw/html";
// Output: orphan-word patch records, one JSON object per line.
const OUTPUT_PATH = "data/paragraphs/patches/orphan-word-patches.jsonl";

// List patterns to exclude (legitimate lowercase starts): a paragraph opening
// with one of these is a list continuation, not a dropped leading word.
const LIST_PATTERNS = /^(and |or |including |such as |as well as |along with |that |which |where |whether |as described |for example|for more |pursuant to |in addition )/i;
|
|
|
|
/** A paragraph record as read from the cleaned JSONL input. */
interface Paragraph {
  /** Stable identifier, carried through to the patch record. */
  id: string;
  /** Paragraph body text; candidates here start with a lowercase letter. */
  text: string;
  textHash: string;
  wordCount: number;
  /** Position of this paragraph within its filing. */
  paragraphIndex: number;
  /** Filing metadata; accessionNumber locates the source HTML file. */
  filing: {
    accessionNumber: string;
    companyName: string;
    [key: string]: unknown;
  };
}
|
|
|
|
/** One recovered orphan word, emitted as a JSONL line to OUTPUT_PATH. */
interface PatchRecord {
  /** id of the paragraph being patched. */
  id: string;
  /** Accession number of the filing the paragraph came from. */
  accession: string;
  paragraphIndex: number;
  /** The recovered leading word(s) — 1 to 3 words, starts uppercase. */
  orphanWord: string;
  /** First 60 chars of the paragraph as it currently stands. */
  originalStart: string;
  /** orphanWord + " " + originalStart: preview of the patched text. */
  patchedStart: string;
  /** Recovery method; this script always writes "html-lookback". */
  method: string;
}
|
|
|
|
// Cache stripped HTML per filing
|
|
const strippedCache = new Map<string, string>();
|
|
|
|
function getStrippedHtml(accession: string): string | null {
|
|
if (strippedCache.has(accession)) return strippedCache.get(accession)!;
|
|
|
|
const htmlPath = `${HTML_DIR}/${accession}.html`;
|
|
if (!existsSync(htmlPath)) return null;
|
|
|
|
const html = readFileSync(htmlPath, "utf-8");
|
|
const stripped = stripHtml(html);
|
|
strippedCache.set(accession, stripped);
|
|
return stripped;
|
|
}
|
|
|
|
function findOrphanWord(stripped: string, paragraphText: string): string | null {
|
|
// Use first 80 chars to search — avoids paragraph-end differences
|
|
const searchText = paragraphText.substring(0, Math.min(80, paragraphText.length));
|
|
const idx = stripped.indexOf(searchText);
|
|
if (idx === -1) return null;
|
|
|
|
// Look backwards to find the orphaned word
|
|
const before = stripped.substring(Math.max(0, idx - 200), idx);
|
|
const lines = before.split("\n");
|
|
const candidates = lines.filter((l) => l.trim().length > 0);
|
|
if (candidates.length === 0) return null;
|
|
|
|
const lastLine = candidates[candidates.length - 1]!.trim();
|
|
|
|
// Validate: short (1-3 words), starts with uppercase
|
|
const words = lastLine.split(/\s+/);
|
|
if (words.length > 3 || words.length === 0) return null;
|
|
if (!/^[A-Z]/.test(words[0]!)) return null;
|
|
|
|
// Reject all-caps headings (>15 chars)
|
|
if (lastLine === lastLine.toUpperCase() && lastLine.length > 15) return null;
|
|
|
|
// Reject section/item references and page artifacts
|
|
if (/^(item|part|section)\s/i.test(lastLine)) return null;
|
|
if (/^\d+[\.\)]/.test(lastLine)) return null;
|
|
if (/^table of contents$/i.test(lastLine)) return null;
|
|
|
|
return lastLine;
|
|
}
|
|
|
|
// ─── Main ───
|
|
|
|
const start = Date.now();
|
|
mkdirSync("data/paragraphs/patches", { recursive: true });
|
|
|
|
process.stderr.write(" Loading paragraphs...\n");
|
|
const paragraphs: Paragraph[] = [];
|
|
for (const line of readFileSync(PARAGRAPHS_PATH, "utf-8").split("\n")) {
|
|
if (line.trim()) paragraphs.push(JSON.parse(line));
|
|
}
|
|
process.stderr.write(` ${paragraphs.length} paragraphs loaded\n`);
|
|
|
|
// Find candidates
|
|
const candidateParas = paragraphs.filter((p) => {
|
|
if (!p.text || p.text.length === 0) return false;
|
|
if (!/^[a-z]/.test(p.text)) return false;
|
|
if (LIST_PATTERNS.test(p.text)) return false;
|
|
return true;
|
|
});
|
|
process.stderr.write(` ${candidateParas.length} orphan word candidates\n\n`);
|
|
|
|
// Process
|
|
const patches: PatchRecord[] = [];
|
|
let notFound = 0;
|
|
let noOrphan = 0;
|
|
let lastAcc = "";
|
|
|
|
for (let i = 0; i < candidateParas.length; i++) {
|
|
const p = candidateParas[i]!;
|
|
const acc = p.filing.accessionNumber;
|
|
|
|
if (acc !== lastAcc) {
|
|
if (strippedCache.size > 20) strippedCache.clear();
|
|
lastAcc = acc;
|
|
}
|
|
|
|
const stripped = getStrippedHtml(acc);
|
|
if (!stripped) { notFound++; continue; }
|
|
|
|
const orphan = findOrphanWord(stripped, p.text);
|
|
if (!orphan) { noOrphan++; continue; }
|
|
|
|
patches.push({
|
|
id: p.id,
|
|
accession: acc,
|
|
paragraphIndex: p.paragraphIndex,
|
|
orphanWord: orphan,
|
|
originalStart: p.text.substring(0, 60),
|
|
patchedStart: orphan + " " + p.text.substring(0, 60),
|
|
method: "html-lookback",
|
|
});
|
|
|
|
if ((i + 1) % 200 === 0) {
|
|
process.stderr.write(
|
|
`\x1b[2K\r ${i + 1}/${candidateParas.length} | ${patches.length} patched | ${noOrphan} no orphan | ${notFound} no HTML`,
|
|
);
|
|
}
|
|
}
|
|
|
|
writeFileSync(OUTPUT_PATH, patches.map((p) => JSON.stringify(p)).join("\n") + "\n");
|
|
|
|
const elapsed = ((Date.now() - start) / 1000).toFixed(1);
|
|
process.stderr.write(
|
|
`\n\n Done in ${elapsed}s\n` +
|
|
` ${candidateParas.length} candidates → ${patches.length} patches found\n` +
|
|
` ${noOrphan} candidates: no orphan word found in HTML\n` +
|
|
` ${notFound} candidates: HTML file not found\n` +
|
|
` Output: ${OUTPUT_PATH}\n`,
|
|
);
|
|
|
|
// Word frequency summary
|
|
const wordCounts = new Map<string, number>();
|
|
for (const p of patches) {
|
|
wordCounts.set(p.orphanWord, (wordCounts.get(p.orphanWord) ?? 0) + 1);
|
|
}
|
|
const sorted = [...wordCounts.entries()].sort((a, b) => b[1] - a[1]);
|
|
process.stderr.write("\n Top orphan words:\n");
|
|
for (const [word, count] of sorted.slice(0, 15)) {
|
|
process.stderr.write(` ${word}: ${count}\n`);
|
|
}
|