SEC-cyBERT/ts/src/extract/segment.ts

import { v4 as uuidv4 } from "uuid";
import { createHash } from "node:crypto";
import type { Paragraph, FilingMeta } from "../schemas/paragraph.ts";

/**
 * Minimum word count for a block to be emitted as its own paragraph.
 * Shorter blocks are either appended to the previous paragraph or dropped.
 */
const MIN_WORDS = 20;
/**
 * Word-count ceiling used when merging blocks and when deciding whether a
 * paragraph must be split at sentence boundaries.
 */
const MAX_WORDS = 500;

/**
 * Normalize text for hashing so that cosmetic differences do not change the hash.
 *
 * Steps, in order: lowercase, collapse every whitespace run to a single space,
 * map typographic single/double quotes to their ASCII forms, map em/en dashes
 * to an ASCII hyphen, then trim.
 *
 * The typographic characters are written as Unicode escapes so the patterns
 * cannot be silently degraded into ASCII-only (no-op) classes by an editor or
 * encoding round trip.
 *
 * @param text Raw paragraph text.
 * @returns The canonical form used as hash input.
 */
function normalizeForHash(text: string): string {
  return text
    .toLowerCase()
    .replace(/\s+/g, " ")
    // U+2018 / U+2019: left / right single quotation mark
    .replace(/[\u2018\u2019]/g, "'")
    // U+201C / U+201D: left / right double quotation mark
    .replace(/[\u201C\u201D]/g, '"')
    // U+2014 em dash, U+2013 en dash
    .replace(/[\u2014\u2013]/g, "-")
    .trim();
}

/**
 * Fingerprint a paragraph for dedup tracking.
 *
 * The text is first passed through `normalizeForHash`, then hashed with
 * SHA-256; only the first 16 hex characters of the digest are returned.
 */
function textHash(text: string): string {
  const hasher = createHash("sha256");
  hasher.update(normalizeForHash(text));
  const fullDigest = hasher.digest("hex");
  return fullDigest.slice(0, 16);
}

/** Number of whitespace-delimited tokens in `text` (0 for empty or all-whitespace input). */
function wordCount(text: string): number {
  const tokens = text.match(/\S+/g);
  return tokens === null ? 0 : tokens.length;
}

/**
 * Split a long text block into chunks of at most MAX_WORDS words, cutting only
 * at sentence boundaries.
 *
 * A sentence is a run of text up to and including one or more of `.`, `!`, `?`
 * plus any whitespace or ASCII double quotes that follow. Text after the last
 * terminator (a trailing fragment with no closing punctuation) is kept as a
 * final sentence rather than discarded, so the pieces together always cover
 * the entire input.
 *
 * A single sentence longer than MAX_WORDS is not split further; it is returned
 * as its own oversized chunk.
 *
 * @param text The block to split.
 * @returns Trimmed, non-empty chunks in original order.
 */
function splitAtSentences(text: string): string[] {
  // First alternative: (optional text) + terminator(s) + trailing whitespace/quotes.
  // Second alternative: whatever is left at the end with no terminator.
  // Together they tile the input, so no characters are lost.
  const sentences = text.match(/[^.!?]*[.!?]+[\s"]*|[^.!?]+$/g) ?? [text];
  const chunks: string[] = [];
  let current = "";

  for (const sentence of sentences) {
    const combined = current + sentence;
    // Flush only when there is something to flush; a lone oversized sentence
    // is accepted as-is instead of producing an empty chunk.
    if (wordCount(combined) > MAX_WORDS && current.length > 0) {
      chunks.push(current.trim());
      current = sentence;
    } else {
      current = combined;
    }
  }
  if (current.trim().length > 0) chunks.push(current.trim());

  return chunks;
}

// Block begins with a bullet glyph (including ASCII hyphen and en/em dash) followed by a whitespace character.
const BULLET_RE = /^[•\-·▪►■◦‣⁃–—]\s/;
// A bullet glyph followed by whitespace anywhere in the text. The ASCII hyphen is
// not in this set; en/em dashes are, so a spaced dash in ordinary prose also matches.
const CONTAINS_BULLET = /[•·▪►■◦‣⁃–—]\s/;
// Text ends with a colon (optionally followed by whitespace), e.g. a list introduction.
const ENDS_WITH_COLON = /:\s*$/;
// Text ends with one of . ! ? ; " ) or a right double quotation mark (U+201D),
// optionally followed by whitespace.
const TERMINAL_PUNCT = /[.!?;")\u201d]\s*$/;
// Text begins with an ASCII lowercase letter.
const STARTS_LOWERCASE = /^[a-z]/;
// A single leading ", :, ., , or ; together with any whitespace after it.
const LEADING_PUNCT = /^[":.,;]\s*/;
// Continuation phrases (matched case-insensitively at the start of a block) that
// indicate the block is mid-sentence regardless of the previous block's punctuation.
const CONTINUATION_RE = /^(and |or |including |such as |along with |that |which |where |whether |as well as |as described |for example|for more |pursuant to |in addition )/i;

/**
 * Coalesce raw text blocks that belong together before paragraph segmentation.
 *
 * Each block is whitespace-normalized (runs collapsed to one space, trimmed)
 * and empty blocks are discarded. A block is then appended to the block before
 * it, separated by a single space, when the combined word count stays within
 * MAX_WORDS and one of these holds:
 *
 * - it is a bullet item and the previous block ends with ":" or already
 *   contains a bullet;
 * - it is not a bullet, starts with a lowercase letter, and the previous block
 *   has more than 5 words and no terminal punctuation;
 * - it is not a bullet and starts with a known continuation phrase (this rule
 *   applies even when the previous block ends with terminal punctuation and
 *   regardless of the previous block's length).
 *
 * Otherwise the block starts a new entry.
 */
function mergeBlocks(rawBlocks: string[]): string[] {
  const normalized = rawBlocks
    .map((raw) => raw.replace(/\s+/g, " ").trim())
    .filter((text) => text.length > 0);

  const out: string[] = [];
  for (const candidate of normalized) {
    const lastIdx = out.length - 1;
    if (lastIdx < 0) {
      out.push(candidate);
      continue;
    }

    const previous = out[lastIdx]!;
    const previousWords = wordCount(previous);
    const fits = previousWords + wordCount(candidate) <= MAX_WORDS;
    const bulleted = BULLET_RE.test(candidate);

    // Bullet item following a list intro or an existing bullet list.
    const extendsList =
      bulleted && (ENDS_WITH_COLON.test(previous) || CONTAINS_BULLET.test(previous));
    // Lowercase tail of a sentence whose head lacks terminal punctuation.
    const lowercaseTail =
      !bulleted &&
      previousWords > 5 &&
      STARTS_LOWERCASE.test(candidate) &&
      !TERMINAL_PUNCT.test(previous);
    // Explicit continuation phrase; deliberately has no length guard on the previous block.
    const continuation = !bulleted && CONTINUATION_RE.test(candidate);

    if (fits && (extendsList || lowercaseTail || continuation)) {
      out[lastIdx] = `${previous} ${candidate}`;
    } else {
      out.push(candidate);
    }
  }

  return out;
}

/**
 * Segment extracted section text into paragraphs suitable for labeling.
 *
 * - Splits on blank lines (two or more consecutive newlines)
 * - Pre-merges blocks via `mergeBlocks` (bullet lists, lowercase continuations)
 * - Strips leading orphaned punctuation
 * - Appends short blocks (< MIN_WORDS) to the previous paragraph when the
 *   fragment ends with terminal punctuation or the previous paragraph does
 *   not, provided the result stays within MAX_WORDS
 * - Deduplicates identical (normalized) text within the call
 * - Drops remaining paragraphs and chunks under MIN_WORDS
 * - Splits paragraphs over MAX_WORDS at sentence boundaries; a paragraph that
 *   cannot be split is emitted whole rather than lost
 * - Assigns UUIDs and sequential paragraph indices
 *
 * @param sectionText Plain text of one extracted section.
 * @param filingMeta Filing metadata attached (by reference) to every paragraph.
 * @returns Paragraphs in document order.
 */
export function segmentParagraphs(
  sectionText: string,
  filingMeta: FilingMeta,
): Paragraph[] {
  const blocks = mergeBlocks(sectionText.split(/\n\n+/));
  const paragraphs: Paragraph[] = [];
  // Hashes of every text considered so far; used for within-call dedup.
  const seen = new Set<string>();
  let idx = 0;

  /** Append a finished paragraph with a fresh UUID and the next index. */
  function emit(text: string, hash: string, words: number): void {
    paragraphs.push({
      id: uuidv4(),
      text,
      textHash: hash,
      wordCount: words,
      paragraphIndex: idx++,
      filing: filingMeta,
    });
  }

  /** Dedup, filter, and (if needed) split one block, emitting the result. */
  function addParagraph(text: string): void {
    // Strip leading orphaned punctuation (": text" → "text")
    const stripped = text.replace(LEADING_PUNCT, "");
    if (stripped.length === 0) return;

    const hash = textHash(stripped);
    if (seen.has(hash)) return;
    seen.add(hash);

    const wc = wordCount(stripped);
    if (wc < MIN_WORDS) return;

    if (wc <= MAX_WORDS) {
      emit(stripped, hash, wc);
      return;
    }

    for (const chunk of splitAtSentences(stripped)) {
      const chunkWc = wordCount(chunk);
      if (chunkWc < MIN_WORDS) continue;
      const chunkHash = textHash(chunk);
      // When the text could not be split, the only chunk is the paragraph
      // itself and its hash was recorded just above. Treating that as a
      // duplicate would silently drop the whole paragraph, so the dedup
      // check is applied only to chunks that differ from the parent.
      if (chunkHash !== hash) {
        if (seen.has(chunkHash)) continue;
        seen.add(chunkHash);
      }
      emit(chunk, chunkHash, chunkWc);
    }
  }

  for (const block of blocks) {
    const stripped = block.replace(LEADING_PUNCT, "");
    if (stripped.length === 0) continue;

    const wc = wordCount(stripped);

    // Short blocks: append to previous paragraph instead of dropping,
    // but only if it completes a sentence or previous was already broken
    if (wc < MIN_WORDS && paragraphs.length > 0) {
      const prev = paragraphs[paragraphs.length - 1]!;
      const fragmentCompletes = TERMINAL_PUNCT.test(stripped);
      const prevAlreadyBroken = !TERMINAL_PUNCT.test(prev.text);
      if (fragmentCompletes || prevAlreadyBroken) {
        const combined = prev.text + " " + stripped;
        const combinedWc = wordCount(combined);
        if (combinedWc <= MAX_WORDS) {
          const combinedHash = textHash(combined);
          prev.text = combined;
          prev.textHash = combinedHash;
          prev.wordCount = combinedWc;
          // Keep the dedup set in step with the paragraph's new content so a
          // later block with the same text is recognized as a duplicate.
          seen.add(combinedHash);
          continue;
        }
      }
    }

    addParagraph(stripped);
  }

  return paragraphs;
}