"""
Investigate whether certain SEC filing generators produce systematically worse
text extraction in the SEC-cyBERT corpus. READ-ONLY analysis.
"""

import json
import os  # NOTE(review): appears unused in this file — confirm before removing
import random
import re
from collections import Counter, defaultdict
from pathlib import Path

# Fixed seed so the random 500-file sample in Step 1 is reproducible.
random.seed(42)

# Input locations: raw filing HTML and the cleaned paragraph-level JSONL.
HTML_DIR = Path("data/raw/html")
PARAGRAPHS_FILE = Path("data/paragraphs/paragraphs-clean.jsonl")

# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────
def extract_generator(header_bytes: bytes) -> str:
    """Extract the document-generator name from the first ~5KB of an HTML file.

    Tries, in order:
      1. a ``<meta name="generator" content="...">`` tag (either attribute order),
      2. a ``<!-- Generated by ... -->`` comment,
      3. a handful of distinctive vendor substrings.

    Args:
        header_bytes: leading bytes of the HTML file (caller reads ~5KB).

    Returns:
        The raw generator string, a ``"... (pattern)"`` vendor guess, or
        ``"UNKNOWN"`` when nothing matches.
    """
    text = header_bytes.decode("utf-8", errors="replace")

    # 1. <meta name="generator" content="...">
    m = re.search(
        r'<meta\s+name\s*=\s*["\']generator["\']\s+content\s*=\s*["\']([^"\']+)["\']',
        text, re.IGNORECASE
    )
    if m:
        return m.group(1).strip()

    # Also try content-before-name attribute order
    m = re.search(
        r'<meta\s+content\s*=\s*["\']([^"\']+)["\']\s+name\s*=\s*["\']generator["\']',
        text, re.IGNORECASE
    )
    if m:
        return m.group(1).strip()

    # 2. <!-- Generated by ... -->
    m = re.search(r'<!--\s*Generated\s+by\s+([^->]+)', text, re.IGNORECASE)
    if m:
        return m.group(1).strip()

    # 3. Distinctive vendor patterns
    if "Workiva" in text or "wkiva" in text.lower():
        return "Workiva (pattern)"
    if "ix:header" in text.lower() or "ix:hidden" in text.lower():
        # iXBRL inline markup — common but does not identify a specific generator
        pass
    if "toppanmerrill" in text.lower() or "toppan" in text.lower():
        return "Toppan Merrill (pattern)"
    # BUG FIX: the original tested the mixed-case needle "EDGAR Online"
    # against text.lower(), which could never match; use a lowercase needle.
    if "donnelley" in text.lower() or "edgar online" in text.lower():
        return "Donnelley/EDGAR Online (pattern)"
    if "GoXBRL" in text:
        return "GoXBRL (pattern)"

    return "UNKNOWN"
def normalize_generator(raw: str) -> str:
    """Map a raw generator string onto a canonical vendor name.

    Matching is case-insensitive and ordered: the first keyword group found
    in *raw* wins.  Unrecognized strings pass through unchanged, and the
    sentinel "UNKNOWN" is preserved as-is.
    """
    low = raw.lower()
    # Ordered (keywords, canonical-name) table.  Order matters: "edgar" is a
    # substring of earlier, more specific groups like "edgar online".
    rules = (
        (("workiva", "wdesk", "wkiva"), "Workiva"),
        (("toppan", "merrill"), "Toppan Merrill"),
        (("donnelley", "edgar online"), "Donnelley"),
        (("goxbrl",), "GoXBRL"),
        (("word", "microsoft"), "Microsoft Word"),
        (("webfilings",), "WebFilings"),
        (("novaworks",), "Novaworks"),
        (("ez-xbrl", "ezxbrl"), "EZ-XBRL"),
        (("ixbrl", "inline xbrl"), "iXBRL Generator"),
        (("vintage",), "Vintage (Donnelley)"),
        (("edgar",), "EDGAR"),
    )
    for keywords, canonical in rules:
        if any(k in low for k in keywords):
            return canonical
    if raw == "UNKNOWN":
        return "UNKNOWN"
    return raw  # keep as-is if no match
def read_generator_for_file(filepath: Path) -> str:
    """Read the first 5KB of *filepath* and return its normalized generator name.

    Best-effort: any failure (missing file, I/O error, decode problem in the
    helpers) yields the sentinel "ERROR" instead of raising.
    """
    try:
        with filepath.open("rb") as handle:
            head = handle.read(5000)
        return normalize_generator(extract_generator(head))
    except Exception:
        return "ERROR"
# ─────────────────────────────────────────────────────────────────────────────
# Step 0: Load paragraphs
# ─────────────────────────────────────────────────────────────────────────────
print("Loading paragraphs...")
paragraphs = []
filing_paragraphs = defaultdict(list)  # accession -> [paragraph dicts]
# One JSON object per line; each paragraph carries its source filing's
# accession number at p["filing"]["accessionNumber"].
with open(PARAGRAPHS_FILE) as f:
    for line in f:
        p = json.loads(line)
        paragraphs.append(p)
        acc = p["filing"]["accessionNumber"]
        filing_paragraphs[acc].append(p)

print(f" Loaded {len(paragraphs):,} paragraphs from {len(filing_paragraphs):,} filings\n")
# ─────────────────────────────────────────────────────────────────────────────
# Step 1: Identify filing generators (500 random HTML files)
# ─────────────────────────────────────────────────────────────────────────────
print("=" * 80)
print("STEP 1: IDENTIFY FILING GENERATORS (500-file sample)")
print("=" * 80)

# Sort before sampling so the seeded RNG draws a reproducible sample.
all_html_files = sorted(HTML_DIR.glob("*.html"))
sample_files = random.sample(all_html_files, min(500, len(all_html_files)))

sample_generators = {}  # filename_stem -> normalized generator name
raw_generator_strings = []  # pre-normalization strings, for the frequency dump

for f in sample_files:
    try:
        with open(f, "rb") as fh:
            header = fh.read(5000)
        raw = extract_generator(header)
        raw_generator_strings.append(raw)
        gen = normalize_generator(raw)
        sample_generators[f.stem] = gen
    except Exception:
        sample_generators[f.stem] = "ERROR"

gen_counts = Counter(sample_generators.values())
# BUG FIX: the percentage was hard-coded as count/5, which is only correct
# when the sample is exactly 500 files; divide by the actual sample size.
sample_n = len(sample_generators) or 1
print(f"\nGenerator distribution (500-file sample):\n")
print(f" {'Generator':<30} {'Count':>6} {'%':>7}")
print(f" {'-'*30} {'-'*6} {'-'*7}")
for gen, count in gen_counts.most_common():
    print(f" {gen:<30} {count:>6} {count / sample_n * 100:.1f}%")

print(f"\nRaw generator strings (unique):")
raw_counts = Counter(raw_generator_strings)
for raw, count in raw_counts.most_common(20):
    print(f" [{count:>4}] {raw[:80]}")
# ─────────────────────────────────────────────────────────────────────────────
# Step 2: Generator-specific quality metrics
# ─────────────────────────────────────────────────────────────────────────────
print("\n" + "=" * 80)
print("STEP 2: GENERATOR-SPECIFIC QUALITY METRICS")
print("=" * 80)

# Major generators: those with >20 filings in sample
major_gens = {g for g, c in gen_counts.items() if c > 20}
print(f"\nMajor generators (>20 in sample): {sorted(major_gens)}\n")

# For each sampled filing that has paragraphs, compute metrics.
# Per-generator accumulators, filled by the loop in the next section.
gen_metrics = defaultdict(lambda: {
    "filing_count": 0,      # sampled filings attributed to this generator
    "para_counts": [],      # paragraphs extracted per filing
    "word_counts": [],      # word count of every paragraph seen
    "lowercase_starts": 0,  # paragraphs whose first char is lowercase
    "total_paras": 0,       # running paragraph total across filings
    "short_paras": 0,       # <25 words
    "html_sizes": [],       # raw HTML bytes per filing
    "text_sizes": [],       # extracted text bytes per filing
})
# Accumulate per-generator quality metrics over the sampled filings.
for stem, gen in sample_generators.items():
    if gen not in major_gens:
        continue

    acc = stem  # filename stem is the accession number
    paras = filing_paragraphs.get(acc, [])

    m = gen_metrics[gen]
    m["filing_count"] += 1
    m["para_counts"].append(len(paras))

    # HTML file size
    html_path = HTML_DIR / f"{stem}.html"
    try:
        html_size = html_path.stat().st_size
    except Exception:
        # Missing/unreadable file counts as size 0 rather than aborting.
        html_size = 0
    m["html_sizes"].append(html_size)

    total_text_len = 0
    for p in paras:
        # Prefer the precomputed wordCount field; fall back to splitting.
        wc = p.get("wordCount", len(p["text"].split()))
        m["word_counts"].append(wc)
        m["total_paras"] += 1
        total_text_len += len(p["text"])

        # A lowercase first character suggests a sentence fragment (bad split).
        if p["text"] and p["text"][0].islower():
            m["lowercase_starts"] += 1
        if wc < 25:
            m["short_paras"] += 1

    m["text_sizes"].append(total_text_len)
# Print table of per-generator averages and problem-paragraph percentages.
print(f" {'Generator':<22} {'Files':>5} {'Avg ¶':>7} {'Avg WC':>7} {'%lc':>6} {'%short':>7} {'ExtRatio':>9}")
print(f" {'-'*22} {'-'*5} {'-'*7} {'-'*7} {'-'*6} {'-'*7} {'-'*9}")

for gen in sorted(major_gens):
    m = gen_metrics[gen]
    n = m["filing_count"]
    if n == 0:
        continue
    avg_paras = sum(m["para_counts"]) / n if n else 0
    avg_wc = sum(m["word_counts"]) / len(m["word_counts"]) if m["word_counts"] else 0
    pct_lc = (m["lowercase_starts"] / m["total_paras"] * 100) if m["total_paras"] else 0
    pct_short = (m["short_paras"] / m["total_paras"] * 100) if m["total_paras"] else 0

    # Extraction ratio: total text bytes / html bytes
    total_html = sum(m["html_sizes"])
    total_text = sum(m["text_sizes"])
    ext_ratio = (total_text / total_html * 100) if total_html else 0

    print(f" {gen:<22} {n:>5} {avg_paras:>7.1f} {avg_wc:>7.1f} {pct_lc:>5.1f}% {pct_short:>6.1f}% {ext_ratio:>8.2f}%")
# ─────────────────────────────────────────────────────────────────────────────
# Step 3: HTML structure analysis — representative snippets
# ─────────────────────────────────────────────────────────────────────────────
print("\n" + "=" * 80)
print("STEP 3: HTML STRUCTURE ANALYSIS (paragraph encoding by generator)")
print("=" * 80)

top5_gens = [g for g, _ in gen_counts.most_common(5)]

for gen in top5_gens:
    # Find a sample file for this generator (first match in the sample dict).
    sample_acc = None
    for stem, g in sample_generators.items():
        if g == gen:
            sample_acc = stem
            break
    if not sample_acc:
        continue

    html_path = HTML_DIR / f"{sample_acc}.html"
    try:
        with open(html_path, "r", errors="replace") as fh:
            content = fh.read(50000)  # read enough to find a paragraph

        # Find a <p> tag or similar paragraph structure
        # Look for a <p tag with at least 20 chars of direct text content
        m = re.search(r'(<p\b[^>]*>[^<]{20,})', content, re.IGNORECASE)
        if m:
            snippet = m.group(1)[:200]
        else:
            # Try <div> or <span> with text
            m = re.search(r'(<(?:div|span)\b[^>]*>[^<]{20,})', content, re.IGNORECASE)
            if m:
                snippet = m.group(1)[:200]
            else:
                snippet = "(no paragraph tag found in first 50KB)"
    except Exception as e:
        snippet = f"(error: {e})"

    print(f"\n Generator: {gen}")
    print(f" File: {sample_acc}.html")
    print(f" Snippet: {snippet}")
    print()
# ─────────────────────────────────────────────────────────────────────────────
# Step 4: Generator fingerprinting of problem paragraphs
# ─────────────────────────────────────────────────────────────────────────────
print("=" * 80)
print("STEP 4: GENERATOR FINGERPRINTING OF PROBLEM PARAGRAPHS")
print("=" * 80)

# Identify problem paragraphs (the categories can overlap).
lowercase_paras = []
long_paras = []  # >300 words
short_paras = []  # <25 words

for p in paragraphs:
    wc = p.get("wordCount", len(p["text"].split()))
    if p["text"] and p["text"][0].islower():
        lowercase_paras.append(p)
    if wc > 300:
        long_paras.append(p)
    if wc < 25:
        short_paras.append(p)

print(f"\n Problem paragraph counts:")
print(f" Lowercase starts: {len(lowercase_paras):,}")
print(f" Long (>300 words): {len(long_paras):,}")
print(f" Short (<25 words): {len(short_paras):,}")
print(f" Total paragraphs: {len(paragraphs):,}")

# For each category, sample up to 200 and look up generators
# We need a cache of accession -> generator since we may need to read many files
print("\n Building generator cache for problem filings...")
# NOTE(review): problem_accessions is built here but never referenced below —
# the loop over all_accessions supersedes it; confirm before removing.
problem_accessions = set()
for p in lowercase_paras:
    problem_accessions.add(p["filing"]["accessionNumber"])
for p in long_paras:
    problem_accessions.add(p["filing"]["accessionNumber"])
for p in short_paras:
    problem_accessions.add(p["filing"]["accessionNumber"])

# Also get generators for ALL filings to compute baseline
print(" Reading generators for ALL filings in the corpus...")
all_accessions = set(filing_paragraphs.keys())
acc_generator = {}  # accession -> normalized generator name (or sentinel)

for acc in all_accessions:
    html_path = HTML_DIR / f"{acc}.html"
    if html_path.exists():
        acc_generator[acc] = read_generator_for_file(html_path)
    else:
        acc_generator[acc] = "FILE_MISSING"

# Baseline distribution
baseline_gen_counts = Counter(acc_generator.values())
print(f"\n Full corpus generator distribution ({len(acc_generator):,} filings):\n")
print(f" {'Generator':<30} {'Count':>6} {'%':>7}")
print(f" {'-'*30} {'-'*6} {'-'*7}")
total_filings = len(acc_generator)
for gen, count in baseline_gen_counts.most_common(15):
    print(f" {gen:<30} {count:>6} {count/total_filings*100:>6.1f}%")
def analyze_problem_category(name, problem_list, acc_generator, baseline_gen_counts, total_filings):
    """Report which generators are over-represented in one problem category.

    Prints per-generator problem-paragraph counts, their share of the problem
    set versus their share of the whole corpus, and the resulting
    over-representation ratio, followed by three example texts.

    NOTE(review): *baseline_gen_counts* and *total_filings* are accepted but
    never used; the paragraph-level baseline is recomputed from the
    module-level `paragraphs` list instead — confirm whether they can go.
    """
    print(f"\n --- {name} ({len(problem_list):,} paragraphs) ---")

    # Generator counts over the problem paragraphs (per paragraph, not per filing).
    gen_para_counts = Counter(
        acc_generator.get(p["filing"]["accessionNumber"], "UNKNOWN")
        for p in problem_list
    )

    total_problem = len(problem_list)
    total_all = len(paragraphs)

    print(f" {'Generator':<30} {'# Problem':>9} {'% of Prob':>9} {'% of All':>9} {'Over-rep':>9}")
    print(f" {'-'*30} {'-'*9} {'-'*9} {'-'*9} {'-'*9}")

    # Baseline: total paragraphs per generator across the whole corpus.
    gen_all_para_counts = Counter(
        acc_generator.get(p["filing"]["accessionNumber"], "UNKNOWN")
        for p in paragraphs
    )

    for gen, prob_count in gen_para_counts.most_common(10):
        pct_of_problem = prob_count / total_problem * 100 if total_problem else 0
        all_count = gen_all_para_counts.get(gen, 1)
        pct_of_all = all_count / total_all * 100 if total_all else 0
        over_rep = pct_of_problem / pct_of_all if pct_of_all else 0
        print(f" {gen:<30} {prob_count:>9,} {pct_of_problem:>8.1f}% {pct_of_all:>8.1f}% {over_rep:>8.2f}x")

    # Show a few example problem texts
    print(f"\n Example texts:")
    for p in problem_list[:3]:
        text = p["text"][:120].replace("\n", " ")
        gen = acc_generator.get(p["filing"]["accessionNumber"], "?")
        print(f" [{gen}] {text}...")
# Run the over-representation analysis for each of the three problem categories.
analyze_problem_category("Lowercase starts (orphan words)", lowercase_paras, acc_generator, baseline_gen_counts, total_filings)
analyze_problem_category("Long paragraphs (>300 words, potential merges)", long_paras, acc_generator, baseline_gen_counts, total_filings)
analyze_problem_category("Short paragraphs (<25 words, potential fragments)", short_paras, acc_generator, baseline_gen_counts, total_filings)
# ─────────────────────────────────────────────────────────────────────────────
# Step 5: Filing size vs extraction quality
# ─────────────────────────────────────────────────────────────────────────────
print("\n" + "=" * 80)
print("STEP 5: FILING SIZE vs EXTRACTION QUALITY")
print("=" * 80)

# Compute HTML size and paragraph count for all filings
size_para_data = []
for acc, paras_list in filing_paragraphs.items():
    html_path = HTML_DIR / f"{acc}.html"
    try:
        html_size = html_path.stat().st_size
    except Exception:
        # Skip filings whose HTML is missing on disk.
        continue
    size_para_data.append({
        "acc": acc,
        "html_size": html_size,
        "para_count": len(paras_list),
        "generator": acc_generator.get(acc, "UNKNOWN"),
    })
# Bin by size ranges (half-open intervals [lo, hi) in bytes).
size_bins = [
    (0, 50_000, "<50KB"),
    (50_000, 200_000, "50-200KB"),
    (200_000, 500_000, "200-500KB"),
    (500_000, 1_000_000, "500KB-1MB"),
    (1_000_000, 5_000_000, "1-5MB"),
    (5_000_000, float("inf"), ">5MB"),
]

print(f"\n HTML Size vs Extracted Paragraphs:\n")
print(f" {'Size Range':<15} {'Files':>6} {'Avg ¶':>7} {'Med ¶':>7} {'Min ¶':>6} {'Max ¶':>6}")
print(f" {'-'*15} {'-'*6} {'-'*7} {'-'*7} {'-'*6} {'-'*6}")

for lo, hi, label in size_bins:
    in_bin = [d for d in size_para_data if lo <= d["html_size"] < hi]
    if not in_bin:
        continue
    counts = sorted([d["para_count"] for d in in_bin])
    avg = sum(counts) / len(counts)
    # Upper median: element at index len//2 of the sorted list (no averaging
    # of the two middle values for even-length bins).
    med = counts[len(counts) // 2]
    print(f" {label:<15} {len(in_bin):>6} {avg:>7.1f} {med:>7} {min(counts):>6} {max(counts):>6}")
# Large HTML files with very few paragraphs — likely extraction failures
print(f"\n Potential extraction failures (HTML >1MB but ≤2 paragraphs):\n")
big_few = [d for d in size_para_data if d["html_size"] > 1_000_000 and d["para_count"] <= 2]
big_few.sort(key=lambda d: d["html_size"], reverse=True)

if not big_few:
    # Relax threshold so the section is never empty if smaller outliers exist.
    print(" (None found with >1MB and ≤2 paragraphs. Relaxing to >500KB and ≤3 paragraphs)\n")
    big_few = [d for d in size_para_data if d["html_size"] > 500_000 and d["para_count"] <= 3]
    big_few.sort(key=lambda d: d["html_size"], reverse=True)

print(f" {'Accession':<30} {'HTML Size':>12} {'Paras':>6} {'Generator':<25}")
print(f" {'-'*30} {'-'*12} {'-'*6} {'-'*25}")
for d in big_few[:10]:
    # Human-readable size: MB above 1MB, else KB.
    size_str = f"{d['html_size']/1024/1024:.2f} MB" if d['html_size'] > 1_000_000 else f"{d['html_size']/1024:.0f} KB"
    print(f" {d['acc']:<30} {size_str:>12} {d['para_count']:>6} {d['generator']:<25}")

# Also show the reverse: small HTML with many paragraphs
print(f"\n Unusual: Small HTML (<50KB) with many paragraphs (>15):\n")
small_many = [d for d in size_para_data if d["html_size"] < 50_000 and d["para_count"] > 15]
small_many.sort(key=lambda d: d["para_count"], reverse=True)

print(f" {'Accession':<30} {'HTML Size':>12} {'Paras':>6} {'Generator':<25}")
print(f" {'-'*30} {'-'*12} {'-'*6} {'-'*25}")
for d in small_many[:10]:
    size_str = f"{d['html_size']/1024:.0f} KB"
    print(f" {d['acc']:<30} {size_str:>12} {d['para_count']:>6} {d['generator']:<25}")
# ─────────────────────────────────────────────────────────────────────────────
# Summary
# ─────────────────────────────────────────────────────────────────────────────
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)

print("""
Key findings are printed above. Look for:
1. Which generators dominate the corpus
2. Whether any generator has notably worse extraction metrics (low para count,
high % lowercase starts, low extraction ratio)
3. Whether problem paragraphs cluster around specific generators (over-rep > 1.5x)
4. Whether large-HTML / few-paragraph cases cluster on a specific generator
""")