SEC-cyBERT/ts/src/extract/segment.ts
2026-03-28 20:39:36 -04:00

209 lines
6.6 KiB
TypeScript

import { v4 as uuidv4 } from "uuid";
import { createHash } from "node:crypto";
import type { Paragraph, FilingMeta } from "../schemas/paragraph.ts";
/** Paragraphs with fewer words than this are dropped or appended to the preceding paragraph. */
const MIN_WORDS = 20;
/** Paragraphs with more words than this are split at sentence boundaries; merges that would exceed it are refused. */
const MAX_WORDS = 500;
/**
 * Normalize text for hashing so that typographic variants of the same
 * passage produce the same digest.
 *
 * Lowercases, collapses every whitespace run to a single space, maps curly
 * single/double quotes to their ASCII forms, maps em/en dashes to "-", and
 * trims both ends.
 *
 * @param text - Raw paragraph text.
 * @returns The canonical form that is fed to the hash function.
 */
function normalizeForHash(text: string): string {
  // The character classes use \u escapes rather than literal glyphs so they
  // cannot be silently flattened to ASCII if the file is re-encoded.
  return text
    .toLowerCase()
    .replace(/\s+/g, " ")
    .replace(/[\u2018\u2019]/g, "'")
    .replace(/[\u201C\u201D]/g, '"')
    .replace(/[\u2014\u2013]/g, "-")
    .trim();
}
/**
 * Fingerprint a passage for cross-filing dedup tracking.
 *
 * @param text - Paragraph text; it is normalized before hashing.
 * @returns The first 16 hex characters of the SHA-256 digest of the normalized text.
 */
function textHash(text: string): string {
  const canonical = normalizeForHash(text);
  const digestHex = createHash("sha256").update(canonical).digest("hex");
  return digestHex.slice(0, 16);
}
/**
 * Count whitespace-delimited tokens.
 *
 * @param text - Any string.
 * @returns Number of maximal non-whitespace runs; 0 for empty or all-whitespace input.
 */
function wordCount(text: string): number {
  const tokens = text.match(/\S+/g);
  return tokens === null ? 0 : tokens.length;
}
/**
 * Split a long text block at sentence boundaries to stay under MAX_WORDS.
 *
 * Sentences are accumulated greedily into a chunk until adding the next one
 * would push the chunk past MAX_WORDS; the chunk is then closed and a new one
 * started. A single sentence that is itself longer than MAX_WORDS is emitted
 * as its own chunk without further splitting.
 *
 * The sentence pattern partitions the whole input: text after the last
 * terminal punctuation mark (e.g. an unpunctuated list tail) is kept as a
 * final sentence instead of being discarded.
 *
 * @param text - The text to split.
 * @returns Trimmed, non-empty chunks in source order.
 */
function splitAtSentences(text: string): string[] {
  // Alternative 1: optional body, terminal punctuation, then trailing whitespace/quotes.
  // Alternative 2: a final run that never reaches terminal punctuation.
  const sentences = text.match(/[^.!?]*[.!?]+[\s"]*|[^.!?]+$/g) ?? [text];
  const chunks: string[] = [];
  let current = "";
  for (const sentence of sentences) {
    const combined = current + sentence;
    // Only close the chunk if it already holds something; otherwise an
    // over-long single sentence would produce an empty chunk.
    if (current.length > 0 && wordCount(combined) > MAX_WORDS) {
      chunks.push(current.trim());
      current = sentence;
    } else {
      current = combined;
    }
  }
  const tail = current.trim();
  if (tail.length > 0) chunks.push(tail);
  return chunks;
}
// Block begins with a bullet glyph, ASCII hyphen, or en/em dash followed by whitespace.
const BULLET_RE = /^[•\-·▪►■◦‣⁃–—]\s/;
// A bullet glyph or en/em dash followed by whitespace appears anywhere in the text.
// Unlike BULLET_RE, the ASCII hyphen is not part of this class.
const CONTAINS_BULLET = /[•·▪►■◦‣⁃–—]\s/;
// Text ends with a colon (optionally followed by whitespace).
const ENDS_WITH_COLON = /:\s*$/;
// Text ends with . ! ? ; " ) or a right double quotation mark (optionally followed by whitespace).
const TERMINAL_PUNCT = /[.!?;")\u201d]\s*$/;
// First character is an ASCII lowercase letter.
const STARTS_LOWERCASE = /^[a-z]/;
// A single leading " : . , or ; together with any whitespace that follows it.
const LEADING_PUNCT = /^[":.,;]\s*/;
// Continuation phrases that indicate a block is mid-sentence regardless of previous punctuation
// (matched case-insensitively at the start of the block).
const CONTINUATION_RE = /^(and |or |including |such as |along with |that |which |where |whether |as well as |as described |for example|for more |pursuant to |in addition )/i;
/**
 * Pre-merge raw blocks before paragraph segmentation.
 *
 * Each raw block has its whitespace collapsed and is dropped if empty. A block
 * is then glued (with a single space) onto the previously kept block when the
 * combined size stays within MAX_WORDS and one of these holds:
 *
 * - it is a bullet and the previous block ends with ":" or already contains a bullet;
 * - it starts lowercase, is not a bullet, and the previous block has more than
 *   five words and no terminal punctuation;
 * - it starts with a continuation phrase and is not a bullet.
 *
 * @param rawBlocks - Blocks as split from the section text.
 * @returns The blocks after merging, in source order.
 */
function mergeBlocks(rawBlocks: string[]): string[] {
  const normalized = rawBlocks
    .map((raw) => raw.replace(/\s+/g, " ").trim())
    .filter((text) => text.length > 0);

  const out: string[] = [];
  for (const block of normalized) {
    const lastIdx = out.length - 1;
    if (lastIdx < 0) {
      out.push(block);
      continue;
    }

    const prev = out[lastIdx]!;
    const prevWords = wordCount(prev);

    // Never let a merge push the result past the paragraph size limit.
    if (prevWords + wordCount(block) > MAX_WORDS) {
      out.push(block);
      continue;
    }

    const bullet = BULLET_RE.test(block);

    // Bullet following its intro sentence or an in-progress bullet list.
    const joinsBulletList =
      bullet && (ENDS_WITH_COLON.test(prev) || CONTAINS_BULLET.test(prev));

    // Lowercase start after an unterminated (and not heading-short) block.
    const lowercaseContinuation =
      !bullet &&
      prevWords > 5 &&
      STARTS_LOWERCASE.test(block) &&
      !TERMINAL_PUNCT.test(prev);

    // Continuation phrase: merged even when the previous block is terminated
    // or short, so headings or page breaks in between do not block the merge.
    const phraseContinuation = !bullet && CONTINUATION_RE.test(block);

    if (joinsBulletList || lowercaseContinuation || phraseContinuation) {
      out[lastIdx] = `${prev} ${block}`;
    } else {
      out.push(block);
    }
  }
  return out;
}
/**
 * Segment extracted section text into paragraphs suitable for labeling.
 *
 * - Splits on double newlines (natural paragraph breaks from HTML extraction)
 * - Merges bullet lists with their intro sentence
 * - Merges lowercase continuations with preceding block
 * - Strips leading orphaned punctuation
 * - Deduplicates identical text within a filing
 * - Appends short fragments to the previous paragraph when they complete a
 *   sentence or the previous paragraph lacks terminal punctuation
 * - Filters out remaining paragraphs under MIN_WORDS (headers, whitespace artifacts)
 * - Splits paragraphs over MAX_WORDS at sentence boundaries
 * - Assigns UUIDs and paragraph indices
 *
 * @param sectionText - Text of one filing section, with blocks separated by blank lines.
 * @param filingMeta - Filing metadata attached unchanged to every paragraph.
 * @returns Paragraphs in source order with consecutive `paragraphIndex` values starting at 0.
 */
export function segmentParagraphs(
  sectionText: string,
  filingMeta: FilingMeta,
): Paragraph[] {
  const blocks = mergeBlocks(sectionText.split(/\n\n+/));
  const paragraphs: Paragraph[] = [];
  // Hashes of every text already considered or emitted, for in-filing dedup.
  const seen = new Set<string>();
  let idx = 0;

  /** Append one paragraph record with a fresh UUID and the next index. */
  function emit(text: string, hash: string, words: number): void {
    paragraphs.push({
      id: uuidv4(),
      text,
      textHash: hash,
      wordCount: words,
      paragraphIndex: idx++,
      filing: filingMeta,
    });
  }

  /** Dedup, size-filter, and (if over-long) split one block, emitting the results. */
  function addParagraph(text: string): void {
    // Strip leading orphaned punctuation (": text" → "text")
    const stripped = text.replace(LEADING_PUNCT, "");
    if (stripped.length === 0) return;

    const hash = textHash(stripped);
    if (seen.has(hash)) return;
    seen.add(hash);

    const wc = wordCount(stripped);
    if (wc < MIN_WORDS) return;

    if (wc <= MAX_WORDS) {
      emit(stripped, hash, wc);
      return;
    }

    // Over-long: emit each sentence-bounded chunk that is itself long enough
    // and has not been seen before.
    for (const chunk of splitAtSentences(stripped)) {
      const chunkWc = wordCount(chunk);
      if (chunkWc < MIN_WORDS) continue;
      const chunkHash = textHash(chunk);
      if (seen.has(chunkHash)) continue;
      seen.add(chunkHash);
      emit(chunk, chunkHash, chunkWc);
    }
  }

  for (const block of blocks) {
    const stripped = block.replace(LEADING_PUNCT, "");
    if (stripped.length === 0) continue;

    // Short blocks: append to previous paragraph instead of dropping,
    // but only if it completes a sentence or previous was already broken
    const prev = paragraphs[paragraphs.length - 1];
    if (prev !== undefined && wordCount(stripped) < MIN_WORDS) {
      const fragmentCompletes = TERMINAL_PUNCT.test(stripped);
      const prevAlreadyBroken = !TERMINAL_PUNCT.test(prev.text);
      if (fragmentCompletes || prevAlreadyBroken) {
        const combined = `${prev.text} ${stripped}`;
        const combinedWc = wordCount(combined);
        if (combinedWc <= MAX_WORDS) {
          const combinedHash = textHash(combined);
          prev.text = combined;
          prev.textHash = combinedHash;
          prev.wordCount = combinedWc;
          // Record the paragraph's new hash so a later identical paragraph
          // is still recognised as a duplicate within this filing.
          seen.add(combinedHash);
          continue;
        }
      }
    }

    addParagraph(stripped);
  }

  return paragraphs;
}