"""
Investigate whether certain SEC filing generators produce systematically worse
text extraction in the SEC-cyBERT corpus. READ-ONLY analysis.
"""
import json
import os
import random
import re
from collections import Counter, defaultdict
from pathlib import Path
# Input locations. Both are relative paths, so they resolve against the
# current working directory at the time they are used.
# NOTE(review): presumably HTML_DIR holds the raw filing HTML and
# PARAGRAPHS_FILE the extracted paragraphs (one JSON object per line) --
# confirm against the code that reads them.
PARAGRAPHS_FILE = Path("data/paragraphs/paragraphs-clean.jsonl")
HTML_DIR = Path("data/raw/html")

# Seed the module-level RNG with a fixed value so any later use of `random`
# yields the same sequence on every run.
random.seed(42)
# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────
def extract_generator(header_bytes: bytes) -> str:
"""Extract generator from first ~5KB of an HTML file."""
text = header_bytes.decode("utf-8", errors="replace")
# 1.
m = re.search(
r'
m = re.search(r'