""" Investigate whether certain SEC filing generators produce systematically worse text extraction in the SEC-cyBERT corpus. READ-ONLY analysis. """ import json import os import random import re from collections import Counter, defaultdict from pathlib import Path random.seed(42) HTML_DIR = Path("data/raw/html") PARAGRAPHS_FILE = Path("data/paragraphs/paragraphs-clean.jsonl") # ───────────────────────────────────────────────────────────────────────────── # Helpers # ───────────────────────────────────────────────────────────────────────────── def extract_generator(header_bytes: bytes) -> str: """Extract generator from first ~5KB of an HTML file.""" text = header_bytes.decode("utf-8", errors="replace") # 1. m = re.search( r' m = re.search(r'