209 lines
6.6 KiB
TypeScript
209 lines
6.6 KiB
TypeScript
import { v4 as uuidv4 } from "uuid";
|
|
import { createHash } from "node:crypto";
|
|
import type { Paragraph, FilingMeta } from "../schemas/paragraph.ts";
|
|
|
|
// Minimum word count for an emitted paragraph; shorter blocks are either appended
// to the previous paragraph or dropped by segmentParagraphs.
const MIN_WORDS = 20;
|
|
// Word budget per paragraph: merges that would exceed it are skipped, and
// longer text is split at sentence boundaries.
const MAX_WORDS = 500;
|
|
|
|
/**
 * Normalize text for hashing so cosmetic variants of the same passage
 * produce the same digest.
 *
 * Steps, in order: lowercase, collapse every whitespace run to one space,
 * fold typographic single quotes (U+2018, U+2019) to `'`, fold typographic
 * double quotes (U+201C, U+201D) to `"`, fold em/en dashes to `-`, then trim.
 *
 * @param text - Raw paragraph text.
 * @returns The canonical form that is fed to the hash function.
 */
function normalizeForHash(text: string): string {
  return text
    .toLowerCase()
    .replace(/\s+/g, " ")
    // Unicode escapes keep these classes intact even if an editor or a
    // copy/paste step converts literal curly quotes to ASCII, which would
    // silently turn the replacements into no-ops.
    .replace(/[\u2018\u2019]/g, "'")
    .replace(/[\u201C\u201D]/g, '"')
    .replace(/[\u2014\u2013]/g, "-")
    .trim();
}
|
|
|
|
/**
 * Fingerprint a paragraph for dedup tracking.
 *
 * The text is first canonicalized with normalizeForHash, then hashed with
 * SHA-256; only the first 16 hex characters (64 bits) of the digest are kept.
 *
 * @param text - Paragraph text to fingerprint.
 * @returns A 16-character lowercase hex string.
 */
function textHash(text: string): string {
  const canonical = normalizeForHash(text);
  const hasher = createHash("sha256");
  hasher.update(canonical);
  const fullDigest = hasher.digest("hex");
  return fullDigest.slice(0, 16);
}
|
|
|
|
/**
 * Count the whitespace-delimited tokens in a string.
 *
 * Punctuation attached to a token counts as part of that token, and an empty
 * or whitespace-only string yields 0.
 *
 * @param text - Text to measure.
 * @returns Number of maximal runs of non-whitespace characters.
 */
function wordCount(text: string): number {
  const tokens = text.match(/\S+/g);
  return tokens === null ? 0 : tokens.length;
}
|
|
|
|
/**
 * Split a long text block into chunks of whole sentences, packing sentences
 * greedily so that each chunk stays within MAX_WORDS where possible.
 *
 * A sentence is a run of non-terminator characters followed by one or more of
 * `.`, `!`, `?` and any trailing whitespace or double quotes. A final run of
 * text with no terminator is kept as its own sentence instead of being
 * dropped. A single sentence longer than MAX_WORDS is not subdivided, so a
 * chunk can still exceed the limit in that case.
 *
 * @param text - Text to split.
 * @returns Trimmed, non-empty chunks in original order.
 */
function splitAtSentences(text: string): string[] {
  // The `|$` alternative captures trailing text that has no sentence-ending
  // punctuation; without it that tail would be silently lost.
  const sentences = text.match(/[^.!?]+(?:[.!?]+[\s"]*|$)/g) ?? [text];
  const chunks: string[] = [];
  let current = "";

  for (const sentence of sentences) {
    const combined = current + sentence;
    // Only flush when there is something to flush; an oversized first
    // sentence is accepted as-is.
    if (current.length > 0 && wordCount(combined) > MAX_WORDS) {
      chunks.push(current.trim());
      current = sentence;
    } else {
      current = combined;
    }
  }

  const tail = current.trim();
  if (tail.length > 0) chunks.push(tail);

  return chunks;
}
|
|
|
|
// Matches text that begins with a bullet glyph (including ASCII "-" and en/em dashes) followed by whitespace.
const BULLET_RE = /^[•\-·▪►■◦‣⁃–—]\s/;
|
|
// Matches a bullet glyph followed by whitespace anywhere in the text; the ASCII hyphen is not in this set.
const CONTAINS_BULLET = /[•·▪►■◦‣⁃–—]\s/;
|
|
// Matches text whose last non-whitespace character is a colon.
const ENDS_WITH_COLON = /:\s*$/;
|
|
// Matches text that ends (ignoring trailing whitespace) in . ! ? ; " ) or a right double quotation mark (U+201D).
const TERMINAL_PUNCT = /[.!?;")\u201d]\s*$/;
|
|
// Matches text whose first character is an ASCII lowercase letter.
const STARTS_LOWERCASE = /^[a-z]/;
|
|
// Matches a single leading " : . , or ; together with any whitespace that follows it.
const LEADING_PUNCT = /^[":.,;]\s*/;
|
|
// Continuation phrases that indicate a block is mid-sentence regardless of previous punctuation
|
|
// Case-insensitive and anchored at the start of the text. Every alternative except
// "for example" carries a trailing space, so those only match whole words.
const CONTINUATION_RE = /^(and |or |including |such as |along with |that |which |where |whether |as well as |as described |for example|for more |pursuant to |in addition )/i;
|
|
|
|
/**
 * Pre-merge raw text blocks before paragraph segmentation.
 *
 * Each block is whitespace-collapsed and trimmed, and empty blocks are
 * discarded. A block is then appended (space-separated) to the block before
 * it when the combined word count stays within MAX_WORDS and one of these
 * holds:
 *
 * - it is a bullet item and the previous block ends with ":" or already
 *   contains a bullet;
 * - it is not a bullet, starts with a lowercase letter, and the previous block
 *   has more than 5 words and no terminal punctuation;
 * - it is not a bullet and opens with a continuation phrase ("and ",
 *   "including ", ...), regardless of how the previous block ends or how
 *   short it is.
 *
 * @param rawBlocks - Blocks as split from the section text.
 * @returns The merged blocks, in original order.
 */
function mergeBlocks(rawBlocks: string[]): string[] {
  const normalized = rawBlocks
    .map((raw) => raw.replace(/\s+/g, " ").trim())
    .filter((text) => text.length > 0);

  const result: string[] = [];

  for (const candidate of normalized) {
    const lastIndex = result.length - 1;
    const previous = result[lastIndex];

    // First block: nothing to merge into.
    if (previous === undefined) {
      result.push(candidate);
      continue;
    }

    const previousWords = wordCount(previous);
    let shouldJoin = false;

    // Never merge past the word budget.
    if (previousWords + wordCount(candidate) <= MAX_WORDS) {
      if (BULLET_RE.test(candidate)) {
        // Bullet item: attach to its intro line or to the running bullet list.
        shouldJoin =
          ENDS_WITH_COLON.test(previous) || CONTAINS_BULLET.test(previous);
      } else {
        // Sentence split mid-way: previous block has no terminal punctuation.
        const danglingSentence =
          STARTS_LOWERCASE.test(candidate) &&
          !TERMINAL_PUNCT.test(previous) &&
          previousWords > 5;
        // Continuation phrases merge even after terminal punctuation and
        // deliberately ignore the previous block's length.
        shouldJoin = danglingSentence || CONTINUATION_RE.test(candidate);
      }
    }

    if (shouldJoin) {
      result[lastIndex] = previous + " " + candidate;
    } else {
      result.push(candidate);
    }
  }

  return result;
}
|
|
|
|
/**
 * Segment extracted section text into paragraphs suitable for labeling.
 *
 * Pipeline:
 * - Splits on runs of two or more newlines.
 * - Pre-merges blocks via mergeBlocks (bullet lists with their intro,
 *   lowercase continuations with the preceding block).
 * - Strips leading orphaned punctuation.
 * - Appends a block under MIN_WORDS to the previous paragraph when the
 *   fragment ends with terminal punctuation or the previous paragraph does
 *   not, provided the result stays within MAX_WORDS; otherwise the short
 *   block is dropped.
 * - Deduplicates by text hash within this call, including against paragraphs
 *   that grew by absorbing a short fragment.
 * - Splits paragraphs over MAX_WORDS at sentence boundaries, dropping chunks
 *   under MIN_WORDS.
 * - Assigns a fresh UUID and a sequential paragraphIndex to every paragraph.
 *
 * @param sectionText - Extracted text of one filing section.
 * @param filingMeta - Metadata attached unchanged to every returned paragraph.
 * @returns Paragraphs in document order.
 */
export function segmentParagraphs(
  sectionText: string,
  filingMeta: FilingMeta,
): Paragraph[] {
  const blocks = mergeBlocks(sectionText.split(/\n\n+/));
  const paragraphs: Paragraph[] = [];
  const seen = new Set<string>();
  let idx = 0;

  /** Append one finished paragraph, assigning its id and position. */
  function emit(text: string, hash: string, wc: number): void {
    paragraphs.push({
      id: uuidv4(),
      text,
      textHash: hash,
      wordCount: wc,
      paragraphIndex: idx++,
      filing: filingMeta,
    });
  }

  /** Dedup, length-filter and (if needed) sentence-split one block. */
  function addParagraph(text: string): void {
    // Strip leading orphaned punctuation (": text" → "text")
    const stripped = text.replace(LEADING_PUNCT, "");
    if (stripped.length === 0) return;

    const hash = textHash(stripped);
    if (seen.has(hash)) return;
    // Recorded before the length check, so short blocks are remembered too.
    seen.add(hash);

    const wc = wordCount(stripped);
    if (wc < MIN_WORDS) return;

    if (wc <= MAX_WORDS) {
      emit(stripped, hash, wc);
      return;
    }

    for (const chunk of splitAtSentences(stripped)) {
      const chunkWc = wordCount(chunk);
      if (chunkWc < MIN_WORDS) continue;
      const chunkHash = textHash(chunk);
      if (seen.has(chunkHash)) continue;
      seen.add(chunkHash);
      emit(chunk, chunkHash, chunkWc);
    }
  }

  for (const block of blocks) {
    const stripped = block.replace(LEADING_PUNCT, "");
    if (stripped.length === 0) continue;

    const wc = wordCount(stripped);
    const prev = paragraphs[paragraphs.length - 1];

    // Short blocks: append to previous paragraph instead of dropping,
    // but only if it completes a sentence or previous was already broken
    if (wc < MIN_WORDS && prev !== undefined) {
      const fragmentCompletes = TERMINAL_PUNCT.test(stripped);
      const prevAlreadyBroken = !TERMINAL_PUNCT.test(prev.text);
      if (fragmentCompletes || prevAlreadyBroken) {
        const combined = prev.text + " " + stripped;
        const combinedWc = wordCount(combined);
        if (combinedWc <= MAX_WORDS) {
          const combinedHash = textHash(combined);
          prev.text = combined;
          prev.textHash = combinedHash;
          prev.wordCount = combinedWc;
          // Keep the dedup set in sync with the paragraph's new text so a
          // later block identical to the merged text is not emitted again.
          seen.add(combinedHash);
          continue;
        }
      }
    }

    addParagraph(stripped);
  }

  return paragraphs;
}
|