SEC-cyBERT/scripts/audit_corpus.py
2026-03-29 20:33:39 -04:00

436 lines
18 KiB
Python

#!/usr/bin/env python3
"""Audit sec-cyBERT paragraph corpus for text quality issues."""
import json
import re
import random
import os
from collections import Counter, defaultdict
from pathlib import Path
DATA_FILE = Path("data/paragraphs/paragraphs-clean.jsonl")
HTML_DIR = Path("data/raw/html")

# ── Load all paragraphs ──────────────────────────────────────────────────────
print("Loading paragraphs...")
# Explicit UTF-8: the JSONL corpus is UTF-8 regardless of platform locale, so
# don't rely on the locale-dependent default encoding. Blank lines (e.g. a
# trailing newline at EOF) are skipped instead of crashing json.loads.
with open(DATA_FILE, encoding="utf-8") as f:
    paragraphs = [json.loads(line) for line in f if line.strip()]
print(f"Loaded {len(paragraphs):,} paragraphs.\n")
def show(text, limit=200):
    """Return *text* as-is if it fits in *limit* chars, else a truncated copy
    ending in an ASCII ellipsis."""
    return text if len(text) <= limit else text[:limit] + "..."
def header(title):
    """Print *title* framed above and below by 80-char '=' rules."""
    rule = "=" * 80
    print(f"\n{rule}\n {title}\n{rule}\n")
# ══════════════════════════════════════════════════════════════════════════════
# CHECK 1: Inlined headers
# ══════════════════════════════════════════════════════════════════════════════
header("CHECK 1: Inlined Headers")

# A heading merged into its body paragraph looks like: a short (2-10 word)
# title-case or ALL-CAPS phrase at the start, not closed by punctuation,
# immediately followed by a sentence opening with a common starter word.
pat_merged_header = re.compile(
    r"^([A-Z][A-Za-z\s,&/\-\']+?)(?<![.;:!\?\)])\s+"
    r"(We |Our |The |As |In |This |A |An |Each |To |Management |During |Since |For )"
)
# Lowercase words allowed inside a title-case heading.
STOP_WORDS = {"and", "of", "the", "for", "in", "to", "on", "with", "our",
              "its", "an", "a", "or", "&"}

inlined_header_examples = []
for p in paragraphs:
    text = p["text"]
    if len(text) < 50:
        continue
    match = pat_merged_header.match(text)
    if match is None:
        continue
    candidate = match.group(1).strip()
    tokens = candidate.split()
    if len(tokens) < 2 or len(tokens) > 10:
        continue
    # Heading-shaped: every word capitalized (or a stop word), or shouting caps.
    looks_title = all(t[0].isupper() or t.lower() in STOP_WORDS for t in tokens if t)
    looks_allcaps = candidate == candidate.upper() and len(candidate) > 5
    if looks_allcaps:
        inlined_header_examples.append(("ALLCAPS", p, candidate))
    elif looks_title:
        inlined_header_examples.append(("TITLECASE", p, candidate))

print(f"Found {len(inlined_header_examples):,} paragraphs with potential inlined headers.")
allcaps_n = sum(1 for t, _, _ in inlined_header_examples if t == 'ALLCAPS')
titlecase_n = sum(1 for t, _, _ in inlined_header_examples if t == 'TITLECASE')
print(f" - ALLCAPS pattern: {allcaps_n:,}")
print(f" - TITLECASE pattern: {titlecase_n:,}")
print()

# Display a reproducible random sample mixing both kinds.
random.seed(42)
sample = random.sample(inlined_header_examples, min(20, len(inlined_header_examples)))
for i, (kind, p, hdr) in enumerate(sample, 1):
    print(f" [{i}] ({kind}) Header: \"{hdr}\" [{p['filing']['companyName'][:30]}]")
    print(f" {show(p['text'])}")
    print()
# ══════════════════════════════════════════════════════════════════════════════
# CHECK 2: Sentence boundary violations
# ══════════════════════════════════════════════════════════════════════════════
header("CHECK 2: Sentence Boundary Violations")

# "word.Next" — a period glued directly to a capitalized word
pat_dotcap = re.compile(r"[a-z]\.([A-Z][a-z])")
# "word,Next" — a comma glued directly to a capitalized word
pat_commacap = re.compile(r"[a-z],([A-Z][a-z])")
# two words jammed together with no separator at all
# NOTE(review): pat_jammed is compiled but never applied below — kept for parity.
pat_jammed = re.compile(r"[a-z]{2}[A-Z][a-z]{2}")
# Abbreviations/names that legitimately produce ".Capital" sequences.
false_pos_dot = re.compile(
    r"(?:Mr|Mrs|Ms|Dr|Jr|Sr|Inc|Corp|Ltd|Co|No|vs|St|Dept|Gen|Gov|Sec|Vol|Rev|etc|U\.S|U\.K)\."
)

boundary_examples = []
for p in paragraphs:
    text = p["text"]
    hits = []
    for m in pat_dotcap.finditer(text):
        snippet = text[max(0, m.start() - 10) : m.end() + 10]
        # Skip hits produced by a known abbreviation just before the dot.
        window = text[max(0, m.start() - 5) : m.end()]
        if not false_pos_dot.search(window):
            hits.append(("dot-cap", snippet))
    for m in pat_commacap.finditer(text):
        snippet = text[max(0, m.start() - 10) : m.end() + 10]
        hits.append(("comma-cap", snippet))
    if hits:
        boundary_examples.append((p, hits))

print(f"Found {len(boundary_examples):,} paragraphs with sentence boundary violations.")
print()
random.seed(43)
sample = random.sample(boundary_examples, min(20, len(boundary_examples)))
for i, (p, issues) in enumerate(sample, 1):
    print(f" [{i}] [{p['filing']['companyName'][:30]}]")
    # At most three offending snippets per paragraph.
    for kind, ctx in issues[:3]:
        print(f" ({kind}) ...{ctx}...")
    print(f" Full start: {show(p['text'], 150)}")
    print()
# ══════════════════════════════════════════════════════════════════════════════
# CHECK 3: Garbled / nonsensical text
# ══════════════════════════════════════════════════════════════════════════════
header("CHECK 3: Garbled / Nonsensical Text")

# Spaced-out characters: runs of single letters separated by spaces.
pat_spaced = re.compile(r"(?:\b[a-zA-Z]\s){4,}")

garbled_examples = []
for p in paragraphs:
    text = p["text"]
    reason = None
    # Each later check deliberately overrides the earlier reason, so the
    # strongest signal (low-alpha) wins when several apply.
    if pat_spaced.search(text):
        reason = "spaced-chars"
    non_ascii = sum(1 for c in text if ord(c) > 127)
    if len(text) > 20 and non_ascii > len(text) * 0.15:
        reason = f"non-ASCII ({non_ascii}/{len(text)} chars)"
    alpha = sum(1 for c in text if c.isalpha())
    if len(text) > 20 and alpha < len(text) * 0.4:
        reason = f"low-alpha ({alpha}/{len(text)} = {alpha/len(text):.0%})"
    if reason is not None:
        garbled_examples.append((reason, p))

print(f"Found {len(garbled_examples):,} potentially garbled paragraphs.")
# Collapse parameterized reasons (e.g. "non-ASCII (3/40 chars)") to their stem.
reason_counts = Counter(r.split("(")[0].strip() for r, _ in garbled_examples)
for r, c in reason_counts.most_common():
    print(f" - {r}: {c}")
print()
random.seed(44)
sample = random.sample(garbled_examples, min(10, len(garbled_examples)))
for i, (reason, p) in enumerate(sample, 1):
    print(f" [{i}] ({reason}) [{p['filing']['companyName'][:30]}] wc={p['wordCount']}")
    print(f" {show(p['text'], 250)}")
    print()
# ══════════════════════════════════════════════════════════════════════════════
# CHECK 4: HTML / markup artifacts
# ══════════════════════════════════════════════════════════════════════════════
header("CHECK 4: HTML / Markup Artifacts")

# Label → pattern table; order fixes the order labels appear per paragraph.
pat_html_tag = re.compile(r"<[a-zA-Z/][^>]*>")
pat_html_entity = re.compile(r"&(?:amp|lt|gt|nbsp|quot|#\d+|#x[0-9a-fA-F]+);")
pat_xbrl = re.compile(r"\b(?:ix|us-gaap|dei|xbrli):")
pat_css = re.compile(r"(?:font-family|font-size|color:|margin:|padding:|text-align|line-height)", re.IGNORECASE)
_ARTIFACT_CHECKS = [
    ("html-tag", pat_html_tag),
    ("html-entity", pat_html_entity),
    ("xbrl", pat_xbrl),
    ("css", pat_css),
]

html_examples = []
for p in paragraphs:
    text = p["text"]
    reasons = [label for label, pat in _ARTIFACT_CHECKS if pat.search(text)]
    if reasons:
        html_examples.append((reasons, p))

print(f"Found {len(html_examples):,} paragraphs with HTML/markup artifacts.")
reason_counts = Counter()
for reasons, _ in html_examples:
    reason_counts.update(reasons)
for r, c in reason_counts.most_common():
    print(f" - {r}: {c}")
print()
random.seed(45)
sample = random.sample(html_examples, min(10, len(html_examples)))
for i, (reasons, p) in enumerate(sample, 1):
    print(f" [{i}] ({', '.join(reasons)}) [{p['filing']['companyName'][:30]}]")
    print(f" {show(p['text'], 250)}")
    print()
# ══════════════════════════════════════════════════════════════════════════════
# CHECK 5: Truncated paragraphs
# ══════════════════════════════════════════════════════════════════════════════
header("CHECK 5: Truncated Paragraphs")

# Characters that legitimately close a complete paragraph. The set includes the
# curly closing quotes U+201D / U+2019 common in SEC HTML; the previous literal
# had been encoding-mangled into a duplicated straight apostrophe, which made
# every paragraph ending in a smart quote look truncated.
TERMINAL_CHARS = ".!?:;)\"'\u201d\u2019"
# Common abbreviations that end sentences without terminal punct being an issue
abbrevs = {"inc", "corp", "ltd", "co", "mr", "mrs", "ms", "dr", "jr", "sr",
           "etc", "al", "eg", "ie", "vs", "no", "approx", "dept", "gov"}

truncated = []
for p in paragraphs:
    text = p["text"].rstrip()
    if not text:
        continue
    # Ends with terminal punctuation → complete.
    if text[-1] in TERMINAL_CHARS:
        continue
    # Very short texts are likely headings, not truncations.
    if p["wordCount"] <= 5:
        continue
    # A trailing common abbreviation is not evidence of truncation.
    last_word = text.split()[-1].lower().rstrip(".,;:!?")
    if last_word in abbrevs:
        continue
    truncated.append(p)

print(f"Found {len(truncated):,} potentially truncated paragraphs (no terminal punctuation, >5 words).")
print()
random.seed(46)
sample = random.sample(truncated, min(10, len(truncated)))
for i, p in enumerate(sample, 1):
    text = p["text"]
    print(f" [{i}] [{p['filing']['companyName'][:30]}] wc={p['wordCount']}")
    # Show the END of the text, where a truncation is visible.
    if len(text) > 200:
        print(f" ...{text[-200:]}")
    else:
        print(f" {text}")
    print()
# ══════════════════════════════════════════════════════════════════════════════
# CHECK 6: Duplicate text across filings
# ══════════════════════════════════════════════════════════════════════════════
header("CHECK 6: Cross-Filing Duplicate Text")

# Bucket paragraphs by content hash.
hash_to_paras = defaultdict(list)
for p in paragraphs:
    hash_to_paras[p["textHash"]].append(p)

def _accessions(ps):
    """Distinct accession numbers among a list of paragraph records."""
    return set(p["filing"]["accessionNumber"] for p in ps)

# Keep only hashes whose paragraphs span more than one filing.
cross_filing_dupes = {h: ps for h, ps in hash_to_paras.items()
                      if len(_accessions(ps)) > 1}

total_dupe_paragraphs = sum(len(ps) for ps in cross_filing_dupes.values())
print(f"Unique textHashes appearing in multiple filings: {len(cross_filing_dupes):,}")
print(f"Total paragraphs involved: {total_dupe_paragraphs:,}")
print()

# Most widely duplicated first (by distinct filing count).
sorted_dupes = sorted(cross_filing_dupes.items(),
                      key=lambda item: len(_accessions(item[1])), reverse=True)
print("Top 15 most duplicated paragraphs:")
for i, (h, ps) in enumerate(sorted_dupes[:15], 1):
    accessions = _accessions(ps)
    companies = set(p["filing"]["companyName"] for p in ps)
    print(f"\n [{i}] Hash={h}, in {len(accessions)} filings, {len(companies)} companies")
    print(f" Companies: {', '.join(list(companies)[:5])}{'...' if len(companies) > 5 else ''}")
    print(f" Text: {show(ps[0]['text'], 200)}")

# Same-company year-over-year boilerplate vs. cross-company duplication.
same_company_dupes = 0
diff_company_dupes = 0
for ps in cross_filing_dupes.values():
    if len(set(p["filing"]["companyName"] for p in ps)) == 1:
        same_company_dupes += 1
    else:
        diff_company_dupes += 1
print(f"\n\nBreakdown:")
print(f" Same company, different filings (likely year-over-year boilerplate): {same_company_dupes:,}")
print(f" Different companies (likely industry boilerplate or extraction error): {diff_company_dupes:,}")
# ══════════════════════════════════════════════════════════════════════════════
# CHECK 7: Ground truth spot-check
# ══════════════════════════════════════════════════════════════════════════════
# Compares a random sample of corpus paragraphs back to the raw filing HTML.
header("CHECK 7: Ground Truth Spot-Check (10 random paragraphs vs. source HTML)")
def normalize_html_to_plain(html_text):
    """Convert raw HTML to normalized plain text for comparison.

    Strips tags, decodes HTML character references with the stdlib
    ``html.unescape`` (a single left-to-right pass, so double-encoded input
    such as ``&amp;lt;`` is no longer decoded twice as the previous ordered
    ``re.sub`` chain did), blanks any remaining unrecognized named entities,
    and collapses every whitespace run — including NBSP — to a single space.
    """
    import html  # stdlib entity tables; local import keeps module imports unchanged

    # Replace tags with spaces so adjacent text nodes don't fuse together.
    plain = re.sub(r"<[^>]+>", " ", html_text)
    # One-pass decode of named, decimal (&#8217;) and hex (&#x2019;) references.
    plain = html.unescape(plain)
    # Entities html.unescape did not recognize act as separators, matching the
    # previous behavior for unknown entities.
    plain = re.sub(r"&\w+;", " ", plain)
    # \s matches U+00A0, so decoded &nbsp; collapses here too.
    plain = re.sub(r"\s+", " ", plain)
    return plain
# Spot-check a reproducible random sample of paragraphs against source HTML.
random.seed(99)
# min() guards small corpora: random.sample raises ValueError if the
# population is smaller than the sample size (every other sampling site in
# this script already guards this way).
spot_check_sample = random.sample(paragraphs, min(10, len(paragraphs)))
match_count = 0
partial_count = 0
not_found_count = 0
for i, p in enumerate(spot_check_sample, 1):
    acc = p["filing"]["accessionNumber"]
    html_path = HTML_DIR / f"{acc}.html"
    print(f" [{i}] {p['filing']['companyName'][:40]} | {acc}")
    print(f" Paragraph index: {p['paragraphIndex']}, word count: {p['wordCount']}")
    corpus_text = p["text"]
    corpus_norm = re.sub(r"\s+", " ", corpus_text).strip()
    if not html_path.exists():
        print(f" *** HTML file not found: {html_path}")
        print(f" Corpus text: {show(corpus_text, 150)}")
        not_found_count += 1
        print()
        continue
    # SEC HTML is nominally UTF-8; decode explicitly (not via the platform
    # default) and replace undecodable bytes rather than crash mid-audit.
    with open(html_path, "r", encoding="utf-8", errors="replace") as f:
        html_content = f.read()
    plain_html = normalize_html_to_plain(html_content)
    # Best case: the whole normalized paragraph appears verbatim in the HTML.
    if corpus_norm in plain_html:
        print(f" VERBATIM MATCH: Corpus text found exactly in HTML source.")
        match_count += 1
    else:
        # Probe with 40-char substrings taken from several positions so the
        # paragraph can be located even when its boundaries differ.
        found = False
        for start_frac in [0.3, 0.5, 0.1, 0.7]:
            start_pos = int(len(corpus_norm) * start_frac)
            probe = corpus_norm[start_pos:start_pos + 40]
            if not probe:
                continue
            idx = plain_html.find(probe)
            if idx >= 0:
                found = True
                # Show surrounding context from the HTML for comparison.
                ctx_start = max(0, idx - 80)
                ctx_end = min(len(plain_html), idx + len(corpus_norm) + 80)
                html_ctx = plain_html[ctx_start:ctx_end].strip()
                print(f" PARTIAL MATCH: Text found in HTML but paragraph boundaries differ.")
                print(f" Corpus first 120: {corpus_norm[:120]}")
                print(f" HTML context 120: {html_ctx[:120]}")
                partial_count += 1
                break
        if not found:
            print(f" NOT FOUND in HTML plain text!")
            print(f" Corpus text: {show(corpus_text, 150)}")
            not_found_count += 1
    print()
print(f"Spot-check results: {match_count} verbatim, {partial_count} partial, {not_found_count} not found")
# ══════════════════════════════════════════════════════════════════════════════
# SUMMARY
# ══════════════════════════════════════════════════════════════════════════════
header("SUMMARY")
print(f"Total paragraphs: {len(paragraphs):,}")
# Per-check issue counts, printed in check order.
_summary_rows = [
    ("1. Inlined headers", len(inlined_header_examples)),
    ("2. Sentence boundary violations", len(boundary_examples)),
    ("3. Garbled / nonsensical text", len(garbled_examples)),
    ("4. HTML / markup artifacts", len(html_examples)),
    ("5. Truncated paragraphs", len(truncated)),
]
for label, count in _summary_rows:
    print(f" {label}: {count:,}")
print(f" 6. Cross-filing duplicates: {len(cross_filing_dupes):,} unique texts in {total_dupe_paragraphs:,} paragraphs")
print()