// NOTE: `uuidv4` is no longer referenced in the code below (paragraph IDs come from
// node:crypto's randomUUID); the import line is left untouched rather than removed.
import { v4 as uuidv4 } from "uuid";
import { createHash, randomUUID } from "node:crypto";
import type { Paragraph, FilingMeta } from "../schemas/paragraph.ts";

/** Blocks with fewer words than this are treated as fragments rather than paragraphs. */
const MIN_WORDS = 20;
/** Paragraphs with more words than this are split at sentence boundaries. */
const MAX_WORDS = 500;

/**
 * Normalize text for hashing: lowercase, collapse whitespace runs to a single
 * space, and map typographic quotes/dashes to their ASCII equivalents so that
 * the same sentence hashes identically regardless of typography.
 */
function normalizeForHash(text: string): string {
  return text
    .toLowerCase()
    .replace(/\s+/g, " ")
    .replace(/[\u2018\u2019]/g, "'") // curly single quotes → '
    .replace(/[\u201C\u201D]/g, '"') // curly double quotes → "
    .replace(/[\u2014\u2013]/g, "-") // em dash / en dash → -
    .trim();
}

/**
 * Dedup key for a piece of text: the first 16 hex characters of the SHA-256
 * digest of the normalized text (see normalizeForHash).
 */
function textHash(text: string): string {
  return createHash("sha256").update(normalizeForHash(text)).digest("hex").slice(0, 16);
}

/** Count whitespace-separated words in a string (0 for empty or blank input). */
function wordCount(text: string): number {
  return text.match(/\S+/g)?.length ?? 0;
}

// Sentence-ending punctuation, optional closing quote/paren, then whitespace.
// Requiring the whitespace keeps decimals such as "3.5" in one piece.
const SENTENCE_END = /[.!?]+[")\u201d]*\s+/g;

/**
 * Split a long text block into chunks at sentence boundaries, packing as many
 * whole sentences into each chunk as fit within MAX_WORDS.
 *
 * No text is discarded: a trailing fragment without terminal punctuation
 * becomes (part of) the last chunk. A single sentence longer than MAX_WORDS
 * cannot be split here and is returned as one oversized chunk.
 *
 * @param text - The text to split.
 * @returns Trimmed, non-empty chunks in original order.
 */
function splitAtSentences(text: string): string[] {
  // Cut the text after every sentence boundary; pieces concatenate back to `text`.
  const sentences: string[] = [];
  let start = 0;
  for (const match of text.matchAll(SENTENCE_END)) {
    const end = (match.index ?? 0) + match[0].length;
    sentences.push(text.slice(start, end));
    start = end;
  }
  if (start < text.length) sentences.push(text.slice(start));

  // Greedily pack sentences into chunks of at most MAX_WORDS words.
  const chunks: string[] = [];
  let current = "";
  for (const sentence of sentences) {
    const combined = current + sentence;
    if (current.length > 0 && wordCount(combined) > MAX_WORDS) {
      chunks.push(current.trim());
      current = sentence;
    } else {
      current = combined;
    }
  }
  const tail = current.trim();
  if (tail.length > 0) chunks.push(tail);
  return chunks;
}

const BULLET_RE = /^[•\-·▪►■◦‣⁃–—]\s/;
const CONTAINS_BULLET = /[•·▪►■◦‣⁃–—]\s/;
const ENDS_WITH_COLON = /:\s*$/;
const TERMINAL_PUNCT = /[.!?;")\u201d]\s*$/;
const STARTS_LOWERCASE = /^[a-z]/;
const LEADING_PUNCT = /^[":.,;]\s*/;
// Continuation phrases that indicate a block is mid-sentence regardless of previous punctuation
const CONTINUATION_RE =
  /^(and |or |including |such as |along with |that |which |where |whether |as well as |as described |for example|for more |pursuant to |in addition )/i;

/**
 * Decide whether `block` should be appended to the preceding merged block `prev`.
 *
 * @param prev - The block accumulated so far.
 * @param block - The candidate block that follows it.
 * @param prevWc - Word count of `prev`.
 */
function shouldMerge(prev: string, block: string, prevWc: number): boolean {
  if (BULLET_RE.test(block)) {
    // A bullet joins its intro sentence (ends with ":") or an already-started bullet list.
    return ENDS_WITH_COLON.test(prev) || CONTAINS_BULLET.test(prev);
  }
  // Lowercase start after a block with no terminal punctuation: a sentence cut in two.
  // The prevWc guard keeps very short preceding blocks from absorbing the text.
  if (STARTS_LOWERCASE.test(block) && !TERMINAL_PUNCT.test(prev) && prevWc > 5) {
    return true;
  }
  // Continuation phrases merge even when prev has terminal punctuation (sentences
  // split across HTML block elements or page breaks). Deliberately no prevWc guard.
  return CONTINUATION_RE.test(block);
}

/**
 * Pre-merge raw blocks: join bullet lists with their intro sentence,
 * and rejoin continuations split across block boundaries.
 *
 * Each raw block is whitespace-collapsed and trimmed first; empty blocks are
 * dropped. Two blocks are never merged if the result would exceed MAX_WORDS.
 *
 * @param rawBlocks - Blocks as produced by splitting the section text.
 * @returns The merged blocks, in original order.
 */
function mergeBlocks(rawBlocks: string[]): string[] {
  const cleaned = rawBlocks
    .map((raw) => raw.replace(/\s+/g, " ").trim())
    .filter((text) => text.length > 0);

  const merged: string[] = [];
  for (const block of cleaned) {
    const prev = merged[merged.length - 1];
    if (prev === undefined) {
      merged.push(block);
      continue;
    }

    const prevWc = wordCount(prev);
    const fits = prevWc + wordCount(block) <= MAX_WORDS;
    if (fits && shouldMerge(prev, block, prevWc)) {
      merged[merged.length - 1] = prev + " " + block;
    } else {
      merged.push(block);
    }
  }
  return merged;
}

/**
 * Segment extracted section text into paragraphs suitable for labeling.
 *
 * - Splits on double newlines (natural paragraph breaks from HTML extraction)
 * - Merges bullet lists with their intro sentence
 * - Merges lowercase continuations with preceding block
 * - Strips leading orphaned punctuation
 * - Deduplicates identical text within a filing (by normalized-text hash)
 * - Appends a block under MIN_WORDS to the previous paragraph when it completes
 *   a sentence or the previous paragraph lacks terminal punctuation; other
 *   blocks under MIN_WORDS (headers, whitespace artifacts) are dropped
 * - Splits paragraphs over MAX_WORDS at sentence boundaries; a paragraph with no
 *   usable sentence boundary is kept whole rather than discarded
 * - Assigns UUIDs and paragraph indices
 *
 * @param sectionText - Extracted text of one section, blocks separated by blank lines.
 * @param filingMeta - Metadata attached unchanged to every returned paragraph.
 * @returns Paragraphs in document order with consecutive `paragraphIndex` values from 0.
 */
export function segmentParagraphs(
  sectionText: string,
  filingMeta: FilingMeta,
): Paragraph[] {
  const blocks = mergeBlocks(sectionText.split(/\n\n+/));
  const paragraphs: Paragraph[] = [];
  const seen = new Set<string>();
  let idx = 0;

  const emit = (text: string, hash: string, words: number): void => {
    paragraphs.push({
      id: randomUUID(),
      text,
      textHash: hash,
      wordCount: words,
      paragraphIndex: idx++,
      filing: filingMeta,
    });
  };

  function addParagraph(text: string): void {
    // Strip leading orphaned punctuation (": text" → "text")
    const stripped = text.replace(LEADING_PUNCT, "");
    if (stripped.length === 0) return;

    const hash = textHash(stripped);
    if (seen.has(hash)) return;
    seen.add(hash);

    const wc = wordCount(stripped);
    if (wc < MIN_WORDS) return;
    if (wc <= MAX_WORDS) {
      emit(stripped, hash, wc);
      return;
    }

    for (const chunk of splitAtSentences(stripped)) {
      const chunkWc = wordCount(chunk);
      if (chunkWc < MIN_WORDS) continue;
      const chunkHash = textHash(chunk);
      // The parent hash was just recorded above. A chunk that IS the whole
      // paragraph (no sentence boundary found) must not be mistaken for a duplicate.
      if (chunkHash !== hash && seen.has(chunkHash)) continue;
      seen.add(chunkHash);
      emit(chunk, chunkHash, chunkWc);
    }
  }

  for (const block of blocks) {
    const stripped = block.replace(LEADING_PUNCT, "");
    if (stripped.length === 0) continue;

    const wc = wordCount(stripped);
    const prev = paragraphs[paragraphs.length - 1];

    // Short blocks: append to previous paragraph instead of dropping,
    // but only if it completes a sentence or previous was already broken
    if (wc < MIN_WORDS && prev !== undefined) {
      const fragmentCompletes = TERMINAL_PUNCT.test(stripped);
      const prevAlreadyBroken = !TERMINAL_PUNCT.test(prev.text);
      if (fragmentCompletes || prevAlreadyBroken) {
        const combined = prev.text + " " + stripped;
        const combinedWc = wordCount(combined);
        if (combinedWc <= MAX_WORDS) {
          const combinedHash = textHash(combined);
          prev.text = combined;
          prev.textHash = combinedHash;
          prev.wordCount = combinedWc;
          // Keep dedup state in sync with the paragraph's new hash.
          seen.add(combinedHash);
          continue;
        }
      }
    }

    addParagraph(stripped);
  }

  return paragraphs;
}