# SEC-cyBERT/scripts/generator_analysis.py
# 2026-03-29 20:33:39 -04:00
#
# 472 lines
# 19 KiB
# Python

"""
Investigate whether certain SEC filing generators produce systematically worse
text extraction in the SEC-cyBERT corpus. READ-ONLY analysis.
"""
import json
import os
import random
import re
from collections import Counter, defaultdict
from pathlib import Path
random.seed(42)
HTML_DIR = Path("data/raw/html")
PARAGRAPHS_FILE = Path("data/paragraphs/paragraphs-clean.jsonl")
# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────
def extract_generator(header_bytes: bytes) -> str:
    """Identify the filing-generator tool from the first ~5KB of an HTML file.

    Detection order:
      1. ``<meta name="generator" content="...">`` (either attribute order);
      2. an ``<!-- Generated by ... -->`` comment;
      3. distinctive vendor strings in the raw markup.

    Args:
        header_bytes: Raw leading bytes of the HTML file.

    Returns:
        The raw generator string, a ``"<vendor> (pattern)"`` tag, or
        ``"UNKNOWN"`` when nothing matches.
    """
    text = header_bytes.decode("utf-8", errors="replace")
    low = text.lower()  # hoisted: lowered once instead of per check
    # 1. <meta name="generator" content="...">
    m = re.search(
        r'<meta\s+name\s*=\s*["\']generator["\']\s+content\s*=\s*["\']([^"\']+)["\']',
        text, re.IGNORECASE
    )
    if m:
        return m.group(1).strip()
    # Also try content-before-name attribute order
    m = re.search(
        r'<meta\s+content\s*=\s*["\']([^"\']+)["\']\s+name\s*=\s*["\']generator["\']',
        text, re.IGNORECASE
    )
    if m:
        return m.group(1).strip()
    # 2. <!-- Generated by ... -->
    m = re.search(r'<!--\s*Generated\s+by\s+([^->]+)', text, re.IGNORECASE)
    if m:
        return m.group(1).strip()
    # 3. Distinctive vendor patterns
    if "Workiva" in text or "wkiva" in low:
        return "Workiva (pattern)"
    if "ix:header" in low or "ix:hidden" in low:
        # iXBRL inline markers — common but not a specific generator
        pass
    if "toppanmerrill" in low or "toppan" in low:
        return "Toppan Merrill (pattern)"
    # BUG FIX: the original tested `"EDGAR Online" in text.lower()`, which can
    # never be True — the lowered text contains no uppercase letters.
    if "donnelley" in low or "edgar online" in low:
        return "Donnelley/EDGAR Online (pattern)"
    if "GoXBRL" in text:
        return "GoXBRL (pattern)"
    return "UNKNOWN"
def normalize_generator(raw: str) -> str:
    """Map a raw generator string onto a canonical vendor name.

    Unmatched strings (including the "UNKNOWN" marker) pass through
    unchanged.
    """
    # Ordered rules: earlier entries win, e.g. "edgar online" must map to
    # Donnelley before the generic "edgar" catch-all can claim it.
    rules = (
        (("workiva", "wdesk", "wkiva"), "Workiva"),
        (("toppan", "merrill"), "Toppan Merrill"),
        (("donnelley", "edgar online"), "Donnelley"),
        (("goxbrl",), "GoXBRL"),
        (("word", "microsoft"), "Microsoft Word"),
        (("webfilings",), "WebFilings"),
        (("novaworks",), "Novaworks"),
        (("ez-xbrl", "ezxbrl"), "EZ-XBRL"),
        (("ixbrl", "inline xbrl"), "iXBRL Generator"),
        (("vintage",), "Vintage (Donnelley)"),
        (("edgar",), "EDGAR"),
    )
    lowered = raw.lower()
    for needles, canonical in rules:
        if any(needle in lowered for needle in needles):
            return canonical
    return raw  # keep as-is if no rule matched
def read_generator_for_file(filepath: Path) -> str:
    """Sniff the first 5000 bytes of *filepath* and return the normalized
    generator name, or "ERROR" if the file cannot be read."""
    try:
        with filepath.open("rb") as handle:
            return normalize_generator(extract_generator(handle.read(5000)))
    except Exception:
        # Best-effort: unreadable/missing files are tagged rather than fatal.
        return "ERROR"
# ─────────────────────────────────────────────────────────────────────────────
# Step 0: Load paragraphs
# ─────────────────────────────────────────────────────────────────────────────
print("Loading paragraphs...")
# paragraphs: flat list of all paragraph dicts;
# filing_paragraphs: accession number -> list of that filing's paragraphs.
paragraphs = []
filing_paragraphs = defaultdict(list)  # accession -> [paragraph dicts]
with open(PARAGRAPHS_FILE) as f:
    for line in f:
        record = json.loads(line)
        paragraphs.append(record)
        filing_paragraphs[record["filing"]["accessionNumber"]].append(record)
print(f" Loaded {len(paragraphs):,} paragraphs from {len(filing_paragraphs):,} filings\n")
# ─────────────────────────────────────────────────────────────────────────────
# Step 1: Identify filing generators (500 random HTML files)
# ─────────────────────────────────────────────────────────────────────────────
print("=" * 80)
print("STEP 1: IDENTIFY FILING GENERATORS (500-file sample)")
print("=" * 80)
# Sample up to 500 HTML files and sniff each one's generator string.
all_html_files = sorted(HTML_DIR.glob("*.html"))
sample_files = random.sample(all_html_files, min(500, len(all_html_files)))
sample_generators = {}  # filename_stem -> normalized generator
raw_generator_strings = []  # un-normalized strings, for the frequency dump
for f in sample_files:
    try:
        with open(f, "rb") as fh:
            header = fh.read(5000)
        raw = extract_generator(header)
        raw_generator_strings.append(raw)
        sample_generators[f.stem] = normalize_generator(raw)
    except Exception:
        sample_generators[f.stem] = "ERROR"
gen_counts = Counter(sample_generators.values())
print(f"\nGenerator distribution (500-file sample):\n")
print(f" {'Generator':<30} {'Count':>6} {'%':>7}")
print(f" {'-'*30} {'-'*6} {'-'*7}")
for gen, count in gen_counts.most_common():
    # BUG FIX: the original printed `count/5`, which is only a percentage when
    # exactly 500 files were sampled; use the actual sample size instead.
    print(f" {gen:<30} {count:>6} {count / len(sample_files) * 100:.1f}%")
print(f"\nRaw generator strings (unique):")
raw_counts = Counter(raw_generator_strings)
for raw, count in raw_counts.most_common(20):
    print(f" [{count:>4}] {raw[:80]}")
# ─────────────────────────────────────────────────────────────────────────────
# Step 2: Generator-specific quality metrics
# ─────────────────────────────────────────────────────────────────────────────
print("\n" + "=" * 80)
print("STEP 2: GENERATOR-SPECIFIC QUALITY METRICS")
print("=" * 80)
# Major generators: those with >20 filings in sample
major_gens = {g for g, c in gen_counts.items() if c > 20}
print(f"\nMajor generators (>20 in sample): {sorted(major_gens)}\n")
# Per-generator accumulators for extraction-quality metrics over the
# sampled filings that also appear in the paragraph corpus.
gen_metrics = defaultdict(lambda: {
    "filing_count": 0,
    "para_counts": [],
    "word_counts": [],
    "lowercase_starts": 0,
    "total_paras": 0,
    "short_paras": 0,  # <25 words
    "html_sizes": [],
    "text_sizes": [],
})
for stem, gen in sample_generators.items():
    if gen not in major_gens:
        continue
    # Filename stem doubles as the accession number.
    paras = filing_paragraphs.get(stem, [])
    bucket = gen_metrics[gen]
    bucket["filing_count"] += 1
    bucket["para_counts"].append(len(paras))
    # HTML file size (0 when the file is missing/unreadable)
    try:
        html_size = (HTML_DIR / f"{stem}.html").stat().st_size
    except Exception:
        html_size = 0
    bucket["html_sizes"].append(html_size)
    extracted_chars = 0
    for p in paras:
        wc = p.get("wordCount", len(p["text"].split()))
        bucket["word_counts"].append(wc)
        bucket["total_paras"] += 1
        extracted_chars += len(p["text"])
        if p["text"] and p["text"][0].islower():
            bucket["lowercase_starts"] += 1
        if wc < 25:
            bucket["short_paras"] += 1
    bucket["text_sizes"].append(extracted_chars)
# Print table
print(f" {'Generator':<22} {'Files':>5} {'Avg ¶':>7} {'Avg WC':>7} {'%lc':>6} {'%short':>7} {'ExtRatio':>9}")
print(f" {'-'*22} {'-'*5} {'-'*7} {'-'*7} {'-'*6} {'-'*7} {'-'*9}")
for gen in sorted(major_gens):
    bucket = gen_metrics[gen]
    files = bucket["filing_count"]
    if files == 0:
        continue
    avg_paras = sum(bucket["para_counts"]) / files
    avg_wc = sum(bucket["word_counts"]) / len(bucket["word_counts"]) if bucket["word_counts"] else 0
    pct_lc = (bucket["lowercase_starts"] / bucket["total_paras"] * 100) if bucket["total_paras"] else 0
    pct_short = (bucket["short_paras"] / bucket["total_paras"] * 100) if bucket["total_paras"] else 0
    # Extraction ratio: total text bytes / html bytes
    total_html = sum(bucket["html_sizes"])
    total_text = sum(bucket["text_sizes"])
    ext_ratio = (total_text / total_html * 100) if total_html else 0
    print(f" {gen:<22} {files:>5} {avg_paras:>7.1f} {avg_wc:>7.1f} {pct_lc:>5.1f}% {pct_short:>6.1f}% {ext_ratio:>8.2f}%")
# ─────────────────────────────────────────────────────────────────────────────
# Step 3: HTML structure analysis — representative snippets
# ─────────────────────────────────────────────────────────────────────────────
print("\n" + "=" * 80)
print("STEP 3: HTML STRUCTURE ANALYSIS (paragraph encoding by generator)")
print("=" * 80)
top5_gens = [g for g, _ in gen_counts.most_common(5)]
for gen in top5_gens:
    # First sampled filing attributed to this generator (insertion order).
    sample_acc = next(
        (stem for stem, g in sample_generators.items() if g == gen), None
    )
    if not sample_acc:
        continue
    html_path = HTML_DIR / f"{sample_acc}.html"
    try:
        with open(html_path, "r", errors="replace") as fh:
            content = fh.read(50000)  # read enough to find a paragraph
        # Prefer a <p> tag with real text; fall back to <div>/<span>.
        m = re.search(r'(<p\b[^>]*>[^<]{20,})', content, re.IGNORECASE)
        if m is None:
            m = re.search(r'(<(?:div|span)\b[^>]*>[^<]{20,})', content, re.IGNORECASE)
        if m is not None:
            snippet = m.group(1)[:200]
        else:
            snippet = "(no paragraph tag found in first 50KB)"
    except Exception as e:
        snippet = f"(error: {e})"
    print(f"\n Generator: {gen}")
    print(f" File: {sample_acc}.html")
    print(f" Snippet: {snippet}")
    print()
# ─────────────────────────────────────────────────────────────────────────────
# Step 4: Generator fingerprinting of problem paragraphs
# ─────────────────────────────────────────────────────────────────────────────
print("=" * 80)
print("STEP 4: GENERATOR FINGERPRINTING OF PROBLEM PARAGRAPHS")
print("=" * 80)
# Identify problem paragraphs in a single pass over the corpus.
lowercase_paras = []  # text starts with a lowercase letter (orphan words)
long_paras = []  # >300 words (potential merges)
short_paras = []  # <25 words (potential fragments)
for p in paragraphs:
    wc = p.get("wordCount", len(p["text"].split()))
    if p["text"] and p["text"][0].islower():
        lowercase_paras.append(p)
    if wc > 300:
        long_paras.append(p)
    if wc < 25:
        short_paras.append(p)
print(f"\n Problem paragraph counts:")
print(f" Lowercase starts: {len(lowercase_paras):,}")
print(f" Long (>300 words): {len(long_paras):,}")
print(f" Short (<25 words): {len(short_paras):,}")
print(f" Total paragraphs: {len(paragraphs):,}")
# Generator lookup for EVERY filing in the corpus. This serves both as the
# baseline distribution and as the cache for problem filings (the original
# also built a separate `problem_accessions` set that was never used — that
# dead code is removed).
print("\n Building generator cache for problem filings...")
print(" Reading generators for ALL filings in the corpus...")
acc_generator = {}
for acc in filing_paragraphs:
    html_path = HTML_DIR / f"{acc}.html"
    if html_path.exists():
        acc_generator[acc] = read_generator_for_file(html_path)
    else:
        acc_generator[acc] = "FILE_MISSING"
# Baseline distribution
baseline_gen_counts = Counter(acc_generator.values())
print(f"\n Full corpus generator distribution ({len(acc_generator):,} filings):\n")
print(f" {'Generator':<30} {'Count':>6} {'%':>7}")
print(f" {'-'*30} {'-'*6} {'-'*7}")
total_filings = len(acc_generator)
for gen, count in baseline_gen_counts.most_common(15):
    print(f" {gen:<30} {count:>6} {count/total_filings*100:>6.1f}%")
def analyze_problem_category(name, problem_list, acc_generator, baseline_gen_counts, total_filings):
    """Report which generators are over-represented in one problem category.

    For the top generators among *problem_list*, prints each one's share of
    the problem paragraphs against its share of the whole corpus (an
    over-representation ratio), then a few example texts. Reads the
    module-level ``paragraphs`` list for the corpus-wide baseline.
    ``baseline_gen_counts`` and ``total_filings`` are accepted for signature
    compatibility but not consulted (the baseline here is paragraph-level,
    not filing-level).
    """
    print(f"\n --- {name} ({len(problem_list):,} paragraphs) ---")
    # Problem tally per generator — counted by paragraph, not by filing.
    problem_by_gen = Counter(
        acc_generator.get(p["filing"]["accessionNumber"], "UNKNOWN")
        for p in problem_list
    )
    total_problem = len(problem_list)
    total_all = len(paragraphs)
    print(f" {'Generator':<30} {'# Problem':>9} {'% of Prob':>9} {'% of All':>9} {'Over-rep':>9}")
    print(f" {'-'*30} {'-'*9} {'-'*9} {'-'*9} {'-'*9}")
    # Corpus-wide paragraph tally per generator (the comparison baseline).
    all_by_gen = Counter(
        acc_generator.get(p["filing"]["accessionNumber"], "UNKNOWN")
        for p in paragraphs
    )
    for gen, prob_count in problem_by_gen.most_common(10):
        pct_of_problem = prob_count / total_problem * 100 if total_problem else 0
        pct_of_all = all_by_gen.get(gen, 1) / total_all * 100 if total_all else 0
        over_rep = pct_of_problem / pct_of_all if pct_of_all else 0
        print(f" {gen:<30} {prob_count:>9,} {pct_of_problem:>8.1f}% {pct_of_all:>8.1f}% {over_rep:>8.2f}x")
    # Show a few example problem texts
    print(f"\n Example texts:")
    for p in problem_list[:3]:
        text = p["text"][:120].replace("\n", " ")
        gen = acc_generator.get(p["filing"]["accessionNumber"], "?")
        print(f" [{gen}] {text}...")
# Run the over-representation analysis for each problem category.
for _label, _plist in (
    ("Lowercase starts (orphan words)", lowercase_paras),
    ("Long paragraphs (>300 words, potential merges)", long_paras),
    ("Short paragraphs (<25 words, potential fragments)", short_paras),
):
    analyze_problem_category(_label, _plist, acc_generator, baseline_gen_counts, total_filings)
# ─────────────────────────────────────────────────────────────────────────────
# Step 5: Filing size vs extraction quality
# ─────────────────────────────────────────────────────────────────────────────
print("\n" + "=" * 80)
print("STEP 5: FILING SIZE vs EXTRACTION QUALITY")
print("=" * 80)
# Collect (HTML size, paragraph count, generator) for every filing whose
# HTML file is still present; missing files are skipped.
size_para_data = []
for acc, paras_list in filing_paragraphs.items():
    try:
        nbytes = (HTML_DIR / f"{acc}.html").stat().st_size
    except Exception:
        continue
    size_para_data.append({
        "acc": acc,
        "html_size": nbytes,
        "para_count": len(paras_list),
        "generator": acc_generator.get(acc, "UNKNOWN"),
    })
# Size bins: (inclusive lower bound, exclusive upper bound, label)
size_bins = [
    (0, 50_000, "<50KB"),
    (50_000, 200_000, "50-200KB"),
    (200_000, 500_000, "200-500KB"),
    (500_000, 1_000_000, "500KB-1MB"),
    (1_000_000, 5_000_000, "1-5MB"),
    (5_000_000, float("inf"), ">5MB"),
]
print(f"\n HTML Size vs Extracted Paragraphs:\n")
print(f" {'Size Range':<15} {'Files':>6} {'Avg ¶':>7} {'Med ¶':>7} {'Min ¶':>6} {'Max ¶':>6}")
print(f" {'-'*15} {'-'*6} {'-'*7} {'-'*7} {'-'*6} {'-'*6}")
for lo, hi, label in size_bins:
    in_bin = [d for d in size_para_data if lo <= d["html_size"] < hi]
    if not in_bin:
        continue
    counts = sorted(d["para_count"] for d in in_bin)
    avg = sum(counts) / len(counts)
    med = counts[len(counts) // 2]  # upper median for even-length lists
    print(f" {label:<15} {len(in_bin):>6} {avg:>7.1f} {med:>7} {min(counts):>6} {max(counts):>6}")
# Large HTML files with very few paragraphs — likely extraction failures
print(f"\n Potential extraction failures (HTML >1MB but ≤2 paragraphs):\n")
big_few = sorted(
    (d for d in size_para_data if d["html_size"] > 1_000_000 and d["para_count"] <= 2),
    key=lambda d: d["html_size"],
    reverse=True,
)
if not big_few:
    # Relax threshold when the strict query comes back empty
    print(" (None found with >1MB and ≤2 paragraphs. Relaxing to >500KB and ≤3 paragraphs)\n")
    big_few = sorted(
        (d for d in size_para_data if d["html_size"] > 500_000 and d["para_count"] <= 3),
        key=lambda d: d["html_size"],
        reverse=True,
    )
print(f" {'Accession':<30} {'HTML Size':>12} {'Paras':>6} {'Generator':<25}")
print(f" {'-'*30} {'-'*12} {'-'*6} {'-'*25}")
for d in big_few[:10]:
    size_str = f"{d['html_size']/1024/1024:.2f} MB" if d['html_size'] > 1_000_000 else f"{d['html_size']/1024:.0f} KB"
    print(f" {d['acc']:<30} {size_str:>12} {d['para_count']:>6} {d['generator']:<25}")
# Also show the reverse: small HTML with many paragraphs
print(f"\n Unusual: Small HTML (<50KB) with many paragraphs (>15):\n")
small_many = sorted(
    (d for d in size_para_data if d["html_size"] < 50_000 and d["para_count"] > 15),
    key=lambda d: d["para_count"],
    reverse=True,
)
print(f" {'Accession':<30} {'HTML Size':>12} {'Paras':>6} {'Generator':<25}")
print(f" {'-'*30} {'-'*12} {'-'*6} {'-'*25}")
for d in small_many[:10]:
    size_str = f"{d['html_size']/1024:.0f} KB"
    print(f" {d['acc']:<30} {size_str:>12} {d['para_count']:>6} {d['generator']:<25}")
# ─────────────────────────────────────────────────────────────────────────────
# Summary
# ─────────────────────────────────────────────────────────────────────────────
# Closing guidance: the report itself is printed above; this checklist tells
# the reader which signals matter when interpreting it.
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)
print("""
Key findings are printed above. Look for:
1. Which generators dominate the corpus
2. Whether any generator has notably worse extraction metrics (low para count,
high % lowercase starts, low extraction ratio)
3. Whether problem paragraphs cluster around specific generators (over-rep > 1.5x)
4. Whether large-HTML / few-paragraph cases cluster on a specific generator
""")