"""
Investigate whether certain SEC filing generators produce systematically worse
text extraction in the SEC-cyBERT corpus. READ-ONLY analysis.
"""

import json
import os  # NOTE(review): appears unused in this file — confirm before removing
import random
import re
from collections import Counter, defaultdict
from pathlib import Path

# Fixed seed so the random 500-file sample in Step 1 is reproducible.
random.seed(42)

# Input locations: raw filing HTML and the cleaned paragraph-level JSONL.
HTML_DIR = Path("data/raw/html")
PARAGRAPHS_FILE = Path("data/paragraphs/paragraphs-clean.jsonl")

# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────
def extract_generator(header_bytes: bytes) -> str:
    """Extract the document-generator name from the first ~5KB of an HTML file.

    Tries, in order:
      1. a ``<meta name="generator" content="...">`` tag (either attribute order),
      2. a ``<!-- Generated by ... -->`` comment,
      3. a handful of distinctive vendor substrings.

    Args:
        header_bytes: leading bytes of the HTML file (caller reads ~5KB).

    Returns:
        The raw generator string, a ``"... (pattern)"`` vendor guess, or
        ``"UNKNOWN"`` when nothing matches.
    """
    text = header_bytes.decode("utf-8", errors="replace")

    # 1. <meta name="generator" content="...">
    m = re.search(
        r'<meta\s+name\s*=\s*["\']generator["\']\s+content\s*=\s*["\']([^"\']+)["\']',
        text, re.IGNORECASE
    )
    if m:
        return m.group(1).strip()

    # Also try content-before-name attribute order
    m = re.search(
        r'<meta\s+content\s*=\s*["\']([^"\']+)["\']\s+name\s*=\s*["\']generator["\']',
        text, re.IGNORECASE
    )
    if m:
        return m.group(1).strip()

    # 2. <!-- Generated by ... -->
    m = re.search(r'<!--\s*Generated\s+by\s+([^->]+)', text, re.IGNORECASE)
    if m:
        return m.group(1).strip()

    # 3. Distinctive vendor patterns
    if "Workiva" in text or "wkiva" in text.lower():
        return "Workiva (pattern)"
    if "ix:header" in text.lower() or "ix:hidden" in text.lower():
        # iXBRL inline markup — common but does not identify a specific generator
        pass
    if "toppanmerrill" in text.lower() or "toppan" in text.lower():
        return "Toppan Merrill (pattern)"
    # BUG FIX: the original tested the mixed-case needle "EDGAR Online"
    # against text.lower(), which could never match; use a lowercase needle.
    if "donnelley" in text.lower() or "edgar online" in text.lower():
        return "Donnelley/EDGAR Online (pattern)"
    if "GoXBRL" in text:
        return "GoXBRL (pattern)"

    return "UNKNOWN"
def normalize_generator(raw: str) -> str:
    """Map a raw generator string onto a canonical vendor name.

    Matching is case-insensitive and ordered: the first keyword group found
    in *raw* wins.  Unrecognized strings pass through unchanged, and the
    sentinel "UNKNOWN" is preserved as-is.
    """
    low = raw.lower()
    # Ordered (keywords, canonical-name) table.  Order matters: "edgar" is a
    # substring of earlier, more specific groups like "edgar online".
    rules = (
        (("workiva", "wdesk", "wkiva"), "Workiva"),
        (("toppan", "merrill"), "Toppan Merrill"),
        (("donnelley", "edgar online"), "Donnelley"),
        (("goxbrl",), "GoXBRL"),
        (("word", "microsoft"), "Microsoft Word"),
        (("webfilings",), "WebFilings"),
        (("novaworks",), "Novaworks"),
        (("ez-xbrl", "ezxbrl"), "EZ-XBRL"),
        (("ixbrl", "inline xbrl"), "iXBRL Generator"),
        (("vintage",), "Vintage (Donnelley)"),
        (("edgar",), "EDGAR"),
    )
    for keywords, canonical in rules:
        if any(k in low for k in keywords):
            return canonical
    if raw == "UNKNOWN":
        return "UNKNOWN"
    return raw  # keep as-is if no match
def read_generator_for_file(filepath: Path) -> str:
    """Read the first 5KB of *filepath* and return its normalized generator name.

    Best-effort: any failure (missing file, I/O error, decode problem in the
    helpers) yields the sentinel "ERROR" instead of raising.
    """
    try:
        with filepath.open("rb") as handle:
            head = handle.read(5000)
        return normalize_generator(extract_generator(head))
    except Exception:
        return "ERROR"
# ─────────────────────────────────────────────────────────────────────────────
# Step 0: Load paragraphs
# ─────────────────────────────────────────────────────────────────────────────
print("Loading paragraphs...")
paragraphs = []
filing_paragraphs = defaultdict(list)  # accession -> [paragraph dicts]
# One JSON object per line; each paragraph carries its source filing's
# accession number at p["filing"]["accessionNumber"].
with open(PARAGRAPHS_FILE) as f:
    for line in f:
        p = json.loads(line)
        paragraphs.append(p)
        acc = p["filing"]["accessionNumber"]
        filing_paragraphs[acc].append(p)

print(f" Loaded {len(paragraphs):,} paragraphs from {len(filing_paragraphs):,} filings\n")
# ─────────────────────────────────────────────────────────────────────────────
# Step 1: Identify filing generators (500 random HTML files)
# ─────────────────────────────────────────────────────────────────────────────
print("=" * 80)
print("STEP 1: IDENTIFY FILING GENERATORS (500-file sample)")
print("=" * 80)

# Sort before sampling so the seeded RNG draws a reproducible sample.
all_html_files = sorted(HTML_DIR.glob("*.html"))
sample_files = random.sample(all_html_files, min(500, len(all_html_files)))

sample_generators = {}  # filename_stem -> normalized generator name
raw_generator_strings = []  # pre-normalization strings, for the frequency dump

for f in sample_files:
    try:
        with open(f, "rb") as fh:
            header = fh.read(5000)
        raw = extract_generator(header)
        raw_generator_strings.append(raw)
        gen = normalize_generator(raw)
        sample_generators[f.stem] = gen
    except Exception:
        sample_generators[f.stem] = "ERROR"

gen_counts = Counter(sample_generators.values())
# BUG FIX: the percentage was hard-coded as count/5, which is only correct
# when the sample is exactly 500 files; divide by the actual sample size.
sample_n = len(sample_generators) or 1
print(f"\nGenerator distribution (500-file sample):\n")
print(f" {'Generator':<30} {'Count':>6} {'%':>7}")
print(f" {'-'*30} {'-'*6} {'-'*7}")
for gen, count in gen_counts.most_common():
    print(f" {gen:<30} {count:>6} {count / sample_n * 100:.1f}%")

print(f"\nRaw generator strings (unique):")
raw_counts = Counter(raw_generator_strings)
for raw, count in raw_counts.most_common(20):
    print(f" [{count:>4}] {raw[:80]}")
# ─────────────────────────────────────────────────────────────────────────────
# Step 2: Generator-specific quality metrics
# ─────────────────────────────────────────────────────────────────────────────
print("\n" + "=" * 80)
print("STEP 2: GENERATOR-SPECIFIC QUALITY METRICS")
print("=" * 80)

# Major generators: those with >20 filings in sample
major_gens = {g for g, c in gen_counts.items() if c > 20}
print(f"\nMajor generators (>20 in sample): {sorted(major_gens)}\n")

# For each sampled filing that has paragraphs, compute metrics.
# Per-generator accumulators, filled by the loop in the next section.
gen_metrics = defaultdict(lambda: {
    "filing_count": 0,      # sampled filings attributed to this generator
    "para_counts": [],      # paragraphs extracted per filing
    "word_counts": [],      # word count of every paragraph seen
    "lowercase_starts": 0,  # paragraphs whose first char is lowercase
    "total_paras": 0,       # running paragraph total across filings
    "short_paras": 0,       # <25 words
    "html_sizes": [],       # raw HTML bytes per filing
    "text_sizes": [],       # extracted text bytes per filing
})
# Accumulate per-generator quality metrics over the sampled filings.
for stem, gen in sample_generators.items():
    if gen not in major_gens:
        continue

    acc = stem  # filename stem is the accession number
    paras = filing_paragraphs.get(acc, [])

    m = gen_metrics[gen]
    m["filing_count"] += 1
    m["para_counts"].append(len(paras))

    # HTML file size
    html_path = HTML_DIR / f"{stem}.html"
    try:
        html_size = html_path.stat().st_size
    except Exception:
        # Missing/unreadable file counts as size 0 rather than aborting.
        html_size = 0
    m["html_sizes"].append(html_size)

    total_text_len = 0
    for p in paras:
        # Prefer the precomputed wordCount field; fall back to splitting.
        wc = p.get("wordCount", len(p["text"].split()))
        m["word_counts"].append(wc)
        m["total_paras"] += 1
        total_text_len += len(p["text"])

        # A lowercase first character suggests a sentence fragment (bad split).
        if p["text"] and p["text"][0].islower():
            m["lowercase_starts"] += 1
        if wc < 25:
            m["short_paras"] += 1

    m["text_sizes"].append(total_text_len)
# Print table of per-generator averages and problem-paragraph percentages.
print(f" {'Generator':<22} {'Files':>5} {'Avg ¶':>7} {'Avg WC':>7} {'%lc':>6} {'%short':>7} {'ExtRatio':>9}")
print(f" {'-'*22} {'-'*5} {'-'*7} {'-'*7} {'-'*6} {'-'*7} {'-'*9}")

for gen in sorted(major_gens):
    m = gen_metrics[gen]
    n = m["filing_count"]
    if n == 0:
        continue
    avg_paras = sum(m["para_counts"]) / n if n else 0
    avg_wc = sum(m["word_counts"]) / len(m["word_counts"]) if m["word_counts"] else 0
    pct_lc = (m["lowercase_starts"] / m["total_paras"] * 100) if m["total_paras"] else 0
    pct_short = (m["short_paras"] / m["total_paras"] * 100) if m["total_paras"] else 0

    # Extraction ratio: total text bytes / html bytes
    total_html = sum(m["html_sizes"])
    total_text = sum(m["text_sizes"])
    ext_ratio = (total_text / total_html * 100) if total_html else 0

    print(f" {gen:<22} {n:>5} {avg_paras:>7.1f} {avg_wc:>7.1f} {pct_lc:>5.1f}% {pct_short:>6.1f}% {ext_ratio:>8.2f}%")
# ─────────────────────────────────────────────────────────────────────────────
# Step 3: HTML structure analysis — representative snippets
# ─────────────────────────────────────────────────────────────────────────────
print("\n" + "=" * 80)
print("STEP 3: HTML STRUCTURE ANALYSIS (paragraph encoding by generator)")
print("=" * 80)

top5_gens = [g for g, _ in gen_counts.most_common(5)]

for gen in top5_gens:
    # Find a sample file for this generator (first match in the sample dict).
    sample_acc = None
    for stem, g in sample_generators.items():
        if g == gen:
            sample_acc = stem
            break
    if not sample_acc:
        continue

    html_path = HTML_DIR / f"{sample_acc}.html"
    try:
        with open(html_path, "r", errors="replace") as fh:
            content = fh.read(50000)  # read enough to find a paragraph

        # Find a <p> tag or similar paragraph structure
        # Look for a <p tag with at least 20 chars of direct text content
        m = re.search(r'(<p\b[^>]*>[^<]{20,})', content, re.IGNORECASE)
        if m:
            snippet = m.group(1)[:200]
        else:
            # Try <div> or <span> with text
            m = re.search(r'(<(?:div|span)\b[^>]*>[^<]{20,})', content, re.IGNORECASE)
            if m:
                snippet = m.group(1)[:200]
            else:
                snippet = "(no paragraph tag found in first 50KB)"
    except Exception as e:
        snippet = f"(error: {e})"

    print(f"\n Generator: {gen}")
    print(f" File: {sample_acc}.html")
    print(f" Snippet: {snippet}")
    print()
# ─────────────────────────────────────────────────────────────────────────────
# Step 4: Generator fingerprinting of problem paragraphs
# ─────────────────────────────────────────────────────────────────────────────
print("=" * 80)
print("STEP 4: GENERATOR FINGERPRINTING OF PROBLEM PARAGRAPHS")
print("=" * 80)

# Identify problem paragraphs (the categories can overlap).
lowercase_paras = []
long_paras = []  # >300 words
short_paras = []  # <25 words

for p in paragraphs:
    wc = p.get("wordCount", len(p["text"].split()))
    if p["text"] and p["text"][0].islower():
        lowercase_paras.append(p)
    if wc > 300:
        long_paras.append(p)
    if wc < 25:
        short_paras.append(p)

print(f"\n Problem paragraph counts:")
print(f" Lowercase starts: {len(lowercase_paras):,}")
print(f" Long (>300 words): {len(long_paras):,}")
print(f" Short (<25 words): {len(short_paras):,}")
print(f" Total paragraphs: {len(paragraphs):,}")

# For each category, sample up to 200 and look up generators
# We need a cache of accession -> generator since we may need to read many files
print("\n Building generator cache for problem filings...")
# NOTE(review): problem_accessions is built here but never referenced below —
# the loop over all_accessions supersedes it; confirm before removing.
problem_accessions = set()
for p in lowercase_paras:
    problem_accessions.add(p["filing"]["accessionNumber"])
for p in long_paras:
    problem_accessions.add(p["filing"]["accessionNumber"])
for p in short_paras:
    problem_accessions.add(p["filing"]["accessionNumber"])

# Also get generators for ALL filings to compute baseline
print(" Reading generators for ALL filings in the corpus...")
all_accessions = set(filing_paragraphs.keys())
acc_generator = {}  # accession -> normalized generator name (or sentinel)

for acc in all_accessions:
    html_path = HTML_DIR / f"{acc}.html"
    if html_path.exists():
        acc_generator[acc] = read_generator_for_file(html_path)
    else:
        acc_generator[acc] = "FILE_MISSING"

# Baseline distribution
baseline_gen_counts = Counter(acc_generator.values())
print(f"\n Full corpus generator distribution ({len(acc_generator):,} filings):\n")
print(f" {'Generator':<30} {'Count':>6} {'%':>7}")
print(f" {'-'*30} {'-'*6} {'-'*7}")
total_filings = len(acc_generator)
for gen, count in baseline_gen_counts.most_common(15):
    print(f" {gen:<30} {count:>6} {count/total_filings*100:>6.1f}%")
def analyze_problem_category(name, problem_list, acc_generator, baseline_gen_counts, total_filings):
    """Report which generators are over-represented in one problem category.

    Prints per-generator problem-paragraph counts, their share of the problem
    set versus their share of the whole corpus, and the resulting
    over-representation ratio, followed by three example texts.

    NOTE(review): *baseline_gen_counts* and *total_filings* are accepted but
    never used; the paragraph-level baseline is recomputed from the
    module-level `paragraphs` list instead — confirm whether they can go.
    """
    print(f"\n --- {name} ({len(problem_list):,} paragraphs) ---")

    # Generator counts over the problem paragraphs (per paragraph, not per filing).
    gen_para_counts = Counter(
        acc_generator.get(p["filing"]["accessionNumber"], "UNKNOWN")
        for p in problem_list
    )

    total_problem = len(problem_list)
    total_all = len(paragraphs)

    print(f" {'Generator':<30} {'# Problem':>9} {'% of Prob':>9} {'% of All':>9} {'Over-rep':>9}")
    print(f" {'-'*30} {'-'*9} {'-'*9} {'-'*9} {'-'*9}")

    # Baseline: total paragraphs per generator across the whole corpus.
    gen_all_para_counts = Counter(
        acc_generator.get(p["filing"]["accessionNumber"], "UNKNOWN")
        for p in paragraphs
    )

    for gen, prob_count in gen_para_counts.most_common(10):
        pct_of_problem = prob_count / total_problem * 100 if total_problem else 0
        all_count = gen_all_para_counts.get(gen, 1)
        pct_of_all = all_count / total_all * 100 if total_all else 0
        over_rep = pct_of_problem / pct_of_all if pct_of_all else 0
        print(f" {gen:<30} {prob_count:>9,} {pct_of_problem:>8.1f}% {pct_of_all:>8.1f}% {over_rep:>8.2f}x")

    # Show a few example problem texts
    print(f"\n Example texts:")
    for p in problem_list[:3]:
        text = p["text"][:120].replace("\n", " ")
        gen = acc_generator.get(p["filing"]["accessionNumber"], "?")
        print(f" [{gen}] {text}...")
# Run the over-representation analysis for each of the three problem categories.
analyze_problem_category("Lowercase starts (orphan words)", lowercase_paras, acc_generator, baseline_gen_counts, total_filings)
analyze_problem_category("Long paragraphs (>300 words, potential merges)", long_paras, acc_generator, baseline_gen_counts, total_filings)
analyze_problem_category("Short paragraphs (<25 words, potential fragments)", short_paras, acc_generator, baseline_gen_counts, total_filings)
# ─────────────────────────────────────────────────────────────────────────────
# Step 5: Filing size vs extraction quality
# ─────────────────────────────────────────────────────────────────────────────
print("\n" + "=" * 80)
print("STEP 5: FILING SIZE vs EXTRACTION QUALITY")
print("=" * 80)

# Compute HTML size and paragraph count for all filings
size_para_data = []
for acc, paras_list in filing_paragraphs.items():
    html_path = HTML_DIR / f"{acc}.html"
    try:
        html_size = html_path.stat().st_size
    except Exception:
        # Skip filings whose HTML is missing on disk.
        continue
    size_para_data.append({
        "acc": acc,
        "html_size": html_size,
        "para_count": len(paras_list),
        "generator": acc_generator.get(acc, "UNKNOWN"),
    })
# Bin by size ranges (half-open intervals [lo, hi) in bytes).
size_bins = [
    (0, 50_000, "<50KB"),
    (50_000, 200_000, "50-200KB"),
    (200_000, 500_000, "200-500KB"),
    (500_000, 1_000_000, "500KB-1MB"),
    (1_000_000, 5_000_000, "1-5MB"),
    (5_000_000, float("inf"), ">5MB"),
]

print(f"\n HTML Size vs Extracted Paragraphs:\n")
print(f" {'Size Range':<15} {'Files':>6} {'Avg ¶':>7} {'Med ¶':>7} {'Min ¶':>6} {'Max ¶':>6}")
print(f" {'-'*15} {'-'*6} {'-'*7} {'-'*7} {'-'*6} {'-'*6}")

for lo, hi, label in size_bins:
    in_bin = [d for d in size_para_data if lo <= d["html_size"] < hi]
    if not in_bin:
        continue
    counts = sorted([d["para_count"] for d in in_bin])
    avg = sum(counts) / len(counts)
    # Upper median: element at index len//2 of the sorted list (no averaging
    # of the two middle values for even-length bins).
    med = counts[len(counts) // 2]
    print(f" {label:<15} {len(in_bin):>6} {avg:>7.1f} {med:>7} {min(counts):>6} {max(counts):>6}")
# Large HTML files with very few paragraphs — likely extraction failures
print(f"\n Potential extraction failures (HTML >1MB but ≤2 paragraphs):\n")
big_few = [d for d in size_para_data if d["html_size"] > 1_000_000 and d["para_count"] <= 2]
big_few.sort(key=lambda d: d["html_size"], reverse=True)

if not big_few:
    # Relax threshold so the section is never empty if smaller outliers exist.
    print(" (None found with >1MB and ≤2 paragraphs. Relaxing to >500KB and ≤3 paragraphs)\n")
    big_few = [d for d in size_para_data if d["html_size"] > 500_000 and d["para_count"] <= 3]
    big_few.sort(key=lambda d: d["html_size"], reverse=True)

print(f" {'Accession':<30} {'HTML Size':>12} {'Paras':>6} {'Generator':<25}")
print(f" {'-'*30} {'-'*12} {'-'*6} {'-'*25}")
for d in big_few[:10]:
    # Human-readable size: MB above 1MB, else KB.
    size_str = f"{d['html_size']/1024/1024:.2f} MB" if d['html_size'] > 1_000_000 else f"{d['html_size']/1024:.0f} KB"
    print(f" {d['acc']:<30} {size_str:>12} {d['para_count']:>6} {d['generator']:<25}")

# Also show the reverse: small HTML with many paragraphs
print(f"\n Unusual: Small HTML (<50KB) with many paragraphs (>15):\n")
small_many = [d for d in size_para_data if d["html_size"] < 50_000 and d["para_count"] > 15]
small_many.sort(key=lambda d: d["para_count"], reverse=True)

print(f" {'Accession':<30} {'HTML Size':>12} {'Paras':>6} {'Generator':<25}")
print(f" {'-'*30} {'-'*12} {'-'*6} {'-'*25}")
for d in small_many[:10]:
    size_str = f"{d['html_size']/1024:.0f} KB"
    print(f" {d['acc']:<30} {size_str:>12} {d['para_count']:>6} {d['generator']:<25}")
# ─────────────────────────────────────────────────────────────────────────────
# Summary
# ─────────────────────────────────────────────────────────────────────────────
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)

print("""
Key findings are printed above. Look for:
1. Which generators dominate the corpus
2. Whether any generator has notably worse extraction metrics (low para count,
high % lowercase starts, low extraction ratio)
3. Whether problem paragraphs cluster around specific generators (over-rep > 1.5x)
4. Whether large-HTML / few-paragraph cases cluster on a specific generator
""")