#!/usr/bin/env python3
"""
Cross-reference SEC filing generators with paragraph quality metrics.

Reuses detection logic from detect_generators.py, then computes quality
metrics per generator from paragraphs-clean.jsonl.
"""
import json
import os
import re
import sys
import statistics

from collections import defaultdict, Counter
from pathlib import Path

# Directory of raw filing HTML, one file per accession number.
HTML_DIR = Path("/home/joey/Documents/sec-cyBERT/data/raw/html")
# Cleaned paragraph records, one JSON object per line.
PARAGRAPHS_FILE = Path("/home/joey/Documents/sec-cyBERT/data/paragraphs/paragraphs-clean.jsonl")
# Only the first 20,000 bytes of each HTML file are scanned for
# generator signatures (see detect_generator).
READ_BYTES = 20_000
# ── Generator detection (copied from detect_generators.py) ──

# Accession-number CIK prefixes of known filing agents.  Used as a
# last-resort attribution when the document body carries no signature:
# the leading segment of the filename (before the first "-") is looked
# up here by detect_generator.
FILING_AGENT_CIKS = {
    "0000950170": "Donnelley Financial Solutions",
    "0001193125": "Donnelley Financial Solutions",
    "0001558370": "Toppan Merrill",
    "0001654954": "Toppan Merrill",
}
def _normalize_generator(raw: str) -> str:
|
|
r = raw.strip().lower()
|
|
if "workiva" in r or "wdesk" in r:
|
|
return "Workiva"
|
|
if "donnelley" in r or "dfin" in r or "rrdonnelley" in r:
|
|
return "Donnelley Financial Solutions"
|
|
if ("toppan" in r) or ("merrill" in r and "bridge" in r):
|
|
return "Toppan Merrill"
|
|
if "word" in r and "microsoft" in r:
|
|
return "Microsoft Word"
|
|
if "excel" in r and "microsoft" in r:
|
|
return "Microsoft Excel"
|
|
if "thunderdome" in r:
|
|
return "ThunderDome"
|
|
if "goxbrl" in r:
|
|
return "GoXBRL"
|
|
if "compsci" in r:
|
|
return "CompSci Transform"
|
|
if "certent" in r:
|
|
return "Certent"
|
|
if "iris carbon" in r:
|
|
return "IRIS Carbon"
|
|
if "broadridge" in r or "profile" in r:
|
|
return "Broadridge PROfile"
|
|
if "sec publisher" in r:
|
|
return "SEC Publisher"
|
|
return raw.strip()
|
|
|
|
|
|
def detect_generator(filepath: str) -> str:
    """Identify the filing-preparation tool that produced an HTML filing.

    Reads only the first READ_BYTES (20,000 bytes) of the file and walks an
    ordered cascade of signature checks, from most to least reliable:
      1. <meta> tags (generator / Creator / Producer / ProgId)
      2. HTML comment signatures naming a vendor
      3. Vendor keywords anywhere in the text
      4. SGML <DOCUMENT> wrapper heuristics (filename prefixes, field markers)
      5. Inline-XBRL namespace heuristics
      6. Structural fallbacks (tag counts)
    The order matters: earlier checks are more specific, so reordering
    would change attributions.  Returns a raw label that main() later
    collapses via FAMILY_MAP.
    """
    with open(filepath, "rb") as f:
        raw = f.read(READ_BYTES)
    # Filings are occasionally mis-encoded; replace bad bytes rather than fail.
    text = raw.decode("utf-8", errors="replace")
    text_lower = text.lower()

    # meta generator — try both attribute orders (name-first, content-first)
    m = re.search(r'<meta\s+name\s*=\s*["\']generator["\']\s+content\s*=\s*["\']([^"\']+)["\']', text, re.I)
    if not m:
        m = re.search(r'<meta\s+content\s*=\s*["\']([^"\']+)["\']\s+name\s*=\s*["\']generator["\']', text, re.I)
    if m:
        return _normalize_generator(m.group(1))

    # Word/PDF toolchains often use Creator/Producer instead of generator.
    m = re.search(r'<meta\s+name\s*=\s*["\']Creator["\']\s+content\s*=\s*["\']([^"\']+)["\']', text, re.I)
    if m:
        return _normalize_generator(m.group(1))

    m = re.search(r'<meta\s+name\s*=\s*["\']Producer["\']\s+content\s*=\s*["\']([^"\']+)["\']', text, re.I)
    if m:
        return _normalize_generator(m.group(1))

    # Office "save as HTML" exports carry a ProgId meta tag.
    m = re.search(r'<meta\s+name\s*=\s*["\']ProgId["\']\s+content\s*=\s*["\']([^"\']+)["\']', text, re.I)
    if m:
        progid = m.group(1)
        if "word" in progid.lower():
            return "Microsoft Word"
        if "excel" in progid.lower():
            return "Microsoft Excel"
        return _normalize_generator(progid)

    # Comment signatures — explicit vendor banners inside HTML comments.
    # NOTE(review): `.*` does not cross newlines here, so only single-line
    # comments match — presumably intentional; confirm against the corpus.
    if re.search(r"<!--.*Created with the Workiva Platform.*-->", text, re.I):
        return "Workiva"
    if re.search(r"<!--.*Copyright\s+\d{4}\s+Workiva.*-->", text, re.I):
        return "Workiva"
    if re.search(r"<!--.*Document created using Wdesk.*-->", text, re.I):
        return "Workiva"

    if re.search(r"<!--.*(?:Toppan\s*Merrill|iXBRL document created with.*Toppan).*-->", text, re.I):
        return "Toppan Merrill"
    if re.search(r"<!--.*Merrill\s*Bridge.*-->", text, re.I):
        return "Toppan Merrill"

    if re.search(r"<!--.*Donnelley Financial Solutions.*-->", text, re.I):
        return "Donnelley Financial Solutions"
    if re.search(r"<!--.*RR\s*Donnelley.*-->", text, re.I):
        return "Donnelley Financial Solutions"

    if re.search(r"<!--.*Broadridge\s+PROfile.*-->", text, re.I):
        return "Broadridge PROfile"
    if "broadridge" in text_lower:
        return "Broadridge PROfile"

    # <title> text participates in several later checks.
    m_title = re.search(r"<title[^>]*>([^<]+)</title>", text, re.I)
    title_text = m_title.group(1).strip() if m_title else ""
    if "sec publisher" in text_lower or "sec publisher" in title_text.lower():
        return "SEC Publisher"

    m = re.search(r"<!--.*Powered by IRIS Carbon.*-->", text, re.I)
    if m:
        return "IRIS Carbon"

    if re.search(r"<!--.*Certent\s+Disclosure\s+Management.*-->", text, re.I):
        return "Certent"
    if "certent" in text_lower:
        return "Certent"

    if re.search(r"<!--.*CompSci Resources.*-->", text, re.I):
        return "CompSci Transform"

    if re.search(r"<!--.*RDG Portal.*-->", text, re.I):
        return "RDG Portal"

    if title_text.lower() == "pdf to edgar" or "pdf to edgar" in text_lower[:2000]:
        return "PDF to EDGAR"

    # Generic "Generated by X" / "Created by X" comments.  A leading date
    # (e.g. "12/31/2023") means the comment names a date, not a tool.
    m = re.search(r"<!--\s*Generated\s+by\s+([^-]+?)-->", text, re.I)
    if m:
        val = m.group(1).strip()
        if not re.match(r"^\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}", val):
            return _normalize_generator(val)
    m = re.search(r"<!--\s*Created\s+(?:by|with)\s+([^-]+?)-->", text, re.I)
    if m:
        val = m.group(1).strip()
        if not re.match(r"^\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}", val):
            return _normalize_generator(val)

    # Keyword signatures — vendor names appearing anywhere in the prefix.
    if re.search(r"\bwdesk\b", text_lower):
        return "Workiva"
    if re.search(r"\bworkiva\b", text_lower):
        return "Workiva"
    if re.search(r"\brrdonnelley\b", text_lower):
        return "Donnelley Financial Solutions"
    if re.search(r"\bedgar-online\b", text_lower):
        return "Donnelley Financial Solutions"
    if re.search(r"\btoppan\b", text_lower):
        return "Toppan Merrill"
    # Bare "merrill" is ambiguous (e.g. Merrill Lynch); require an XBRL cue.
    if re.search(r"\bmerrill\b", text_lower) and re.search(r"\b(?:bridge|ixbrl|xbrl)\b", text_lower):
        return "Toppan Merrill"
    # Bowne was acquired into what is now Toppan Merrill.
    if re.search(r"\bbowne\b", text_lower):
        return "Toppan Merrill"
    if re.search(r"\bcompsci\b", text_lower):
        return "CompSci Transform"
    if re.search(r"\bthunderdome\b", text_lower):
        return "ThunderDome"
    if re.search(r"\bgoxbrl\b", text_lower):
        return "GoXBRL"

    # Workiva emits CSS classes prefixed "wk_".
    if re.search(r'class\s*=\s*["\'][^"\']*\bwk_\w+', text_lower):
        return "Workiva"

    # SGML document wrapper — legacy EDGAR submission envelope.
    has_sgml = re.search(r"<DOCUMENT>\s*\n?\s*<TYPE>", text, re.I)
    if has_sgml:
        # Filing agents use recognizable filename prefixes (d…, tm…, ea…).
        m_fn = re.search(r"<FILENAME>\s*([\w\-\.]+)", text, re.I)
        if m_fn:
            filename = m_fn.group(1).lower()
            if re.match(r"d\d+", filename):
                return "Donnelley Financial Solutions"
            if re.match(r"tm\d+", filename):
                return "Toppan Merrill"
            if re.match(r"ea\d+", filename):
                return "EFiling/EDGAR Agent"
        if "<!-- field: rule-page" in text_lower or "rule-page" in text_lower[:5000]:
            return "Broadridge PROfile"
        if "field: set; name: xdx" in text_lower:
            return "EFiling XDX"
        if "<!-- field:" in text_lower[:5000]:
            return "EFiling/EDGAR Agent"
        if re.search(r'<Center><DIV STYLE="width:8\.5in"', text):
            return "Donnelley Financial Solutions"
        # Fall back to the filing agent's CIK prefix in the filename.
        basename = os.path.basename(filepath)
        accession_prefix = basename.split("-")[0]
        if accession_prefix in FILING_AGENT_CIKS:
            return FILING_AGENT_CIKS[accession_prefix]
        font_count = text_lower.count("<font")
        if font_count > 5:
            return "SGML-wrapped (legacy)"
        return "SGML-wrapped (unknown)"

    # Inline XBRL
    has_ix_ns = "xmlns:ix=" in text_lower or "<ix:header" in text_lower
    # DFS house style: Times New Roman paragraphs in an 8.5in centered DIV.
    if re.search(r'<P STYLE="[^"]*font-family:Times New Roman"', text) and re.search(
        r'<Center><DIV STYLE="width:8\.5in"', text
    ):
        return "Donnelley Financial Solutions"
    if title_text:
        title_lower = title_text.lower()
        if "workiva" in title_lower or "wdesk" in title_lower:
            return "Workiva"

    if has_ix_ns:
        if "field: set; name: xdx" in text_lower:
            return "EFiling XDX"
        if "<!-- field: rule" in text_lower:
            return "Broadridge PROfile"
        if "<!-- field:" in text_lower[:5000]:
            return "EFiling/EDGAR Agent"
        basename = os.path.basename(filepath)
        accession_prefix = basename.split("-")[0]
        if accession_prefix in FILING_AGENT_CIKS:
            return FILING_AGENT_CIKS[accession_prefix]
        # Distinguish unattributed iXBRL by the XML prolog's encoding style.
        if '<?xml version="1.0" encoding="utf-8"' in text_lower[:200]:
            return "Inline XBRL (utf-8 toolchain)"
        if "<?xml version='1.0' encoding='ascii'?>" in text_lower[:200]:
            return "Inline XBRL (SEC/EDGAR standard)"
        return "Inline XBRL (tool unresolved)"

    # Structural fallbacks — classify by dominant markup style.
    font_count = text_lower.count("<font")
    td_count = text_lower.count("<td")
    span_count = text_lower.count("<span")

    if font_count > 20:
        return "Legacy generator (font-based)"
    if td_count > 50 and span_count < 10:
        return "Table-based generator"
    data_attr_count = len(re.findall(r"\bdata-\w+", text_lower))
    if data_attr_count > 10:
        return "Modern web tooling"
    return "Unknown"
# ── Consolidate to ~14 families ──

# Collapses raw detect_generator labels into a smaller set of reporting
# families so the output table stays readable.  Labels missing from this
# map pass through unchanged (main uses FAMILY_MAP.get(raw, raw)).
FAMILY_MAP = {
    "Workiva": "Workiva",
    "Donnelley Financial Solutions": "Donnelley Financial Solutions",
    "Toppan Merrill": "Toppan Merrill",
    "CompSci Transform": "CompSci Transform",
    "ThunderDome": "ThunderDome",
    "EFiling/EDGAR Agent": "EFiling/EDGAR Agent",
    "EFiling XDX": "EFiling/EDGAR Agent",
    "Broadridge PROfile": "Broadridge PROfile",
    "SEC Publisher": "SEC Publisher",
    "IRIS Carbon": "IRIS Carbon",
    "RDG Portal": "RDG Portal",
    "Certent": "Certent",
    "PDF to EDGAR": "PDF to EDGAR",
    "GoXBRL": "GoXBRL",
    "Microsoft Word": "Microsoft Word",
    "Microsoft Excel": "Microsoft Excel",
    # iXBRL variants merge into one unattributed bucket.
    "Inline XBRL (SEC/EDGAR standard)": "Inline XBRL (unattributed)",
    "Inline XBRL (utf-8 toolchain)": "Inline XBRL (unattributed)",
    "Inline XBRL (tool unresolved)": "Inline XBRL (unattributed)",
    "SGML-wrapped (legacy)": "SGML-wrapped (unattributed)",
    "SGML-wrapped (unknown)": "SGML-wrapped (unattributed)",
    # Structural fallbacks all report as Other/Legacy.
    "Legacy generator (font-based)": "Other/Legacy",
    "Table-based generator": "Other/Legacy",
    "Modern web tooling": "Other/Legacy",
    "Unknown": "Unknown",
}
# ── Quality metric helpers ──

# Common sentence-starter words: a capitalized word from this set at the
# start of a paragraph is treated as ordinary prose, not a heading word.
NON_HEADING_STARTS = {
    "we", "our", "the", "in", "a", "an", "as", "to", "on", "at", "by",
    "for", "it", "is", "if", "or", "no", "so", "do", "its", "this",
    "that", "with", "from", "has", "had", "have", "will", "may", "can",
    "all", "any", "are", "was", "were", "been", "not", "but", "each",
    "such", "these", "those", "also", "when", "there", "their",
    "they", "them", "than", "who", "what", "how", "where",
}

# Section name fragments for Item 1C (cybersecurity disclosure) headings.
SECTION_KEYWORDS = [
    "risk management", "board oversight", "governance", "incident",
    "strategy", "third party", "management role", "cybersecurity",
    "risk factors", "material", "overview",
]

# ALL-CAPS run (10+ chars of caps/space/comma/&/-) running straight into a
# lowercase letter, i.e. a heading fused onto the start of body text.
RE_ALLCAPS_HEADER = re.compile(r"^[A-Z][A-Z\s,&\-]{10,}[a-z]")


def is_inlined_header(text: str) -> bool:
    """Check if paragraph starts with an inlined heading pattern.

    Three heuristics, checked in order:
      1. An ALL-CAPS header running straight into lowercase body text.
      2. Two or more consecutive Title-Case words at the start (excluding
         common sentence starters) followed by sentence-like text.
      3. A known Item 1C section keyword at the start with body text after.
    """
    # 1. ALL-CAPS header followed by body text.
    if RE_ALLCAPS_HEADER.match(text):
        return True

    # 2. Title-case heading: count leading capitalized words.
    words = text.split()
    if len(words) < 4:
        return False
    cap_count = 0
    boundary = 0  # index of the first word after the heading run
    for idx, w in enumerate(words):
        clean = w.strip(".,;:!?()\"'")
        if not clean:
            # Punctuation-only token: skip without ending the heading run.
            continue
        if clean[0].isupper() and clean.lower() not in NON_HEADING_STARTS:
            cap_count += 1
            # Fix: slice at the actual word index, not at cap_count, so
            # skipped punctuation-only tokens don't misalign the split.
            boundary = idx + 1
        else:
            break
    if cap_count >= 2:
        # Require the rest to continue as a sentence (not a bare title).
        remaining = " ".join(words[boundary:])
        if len(remaining) > 20:
            return True

    # 3. Section keyword match at start, with more text after the heading.
    text_lower = text[:80].lower()
    for kw in SECTION_KEYWORDS:
        if text_lower.startswith(kw) and len(text) > len(kw) + 10:
            return True

    return False
def is_orphan_word(text: str) -> bool:
    """Check if paragraph starts with lowercase (excluding list patterns).

    A lowercase opening usually means the extraction split a sentence and
    this paragraph is an orphaned continuation.  Returns False for empty
    text, any non-lowercase start (uppercase, digits, bullets), and common
    list-continuation openers ("and ", "or ", "including ", "i.e.", "e.g.").
    """
    if not text:
        return False
    if not text[0].islower():
        # Uppercase letters, digits, and bullet characters (•, ·, -, …) all
        # fail islower(), so they are rejected here.  (A separate bullet
        # check in the earlier revision was unreachable for this reason
        # and has been removed.)
        return False
    # Exclude list-pattern starters that legitimately begin lowercase.
    list_starters = ["and ", "or ", "including ", "i.e.", "e.g."]
    text_lower = text[:15].lower()
    for starter in list_starters:
        if text_lower.startswith(starter):
            return False
    return True
# Terminal punctuation — period, !, ?, ;, closing quote or paren — at the
# end of the text, allowing trailing whitespace.
RE_TERMINAL = re.compile(r'[.!?;")]\s*$')


def is_truncated(text: str) -> bool:
    """Paragraph NOT ending with terminal punctuation."""
    return RE_TERMINAL.search(text) is None
def is_fragment(text: str) -> bool:
    """True when the paragraph has fewer than 25 whitespace-separated words."""
    word_total = len(text.split())
    return word_total < 25
def main():
    """Run the full cross-reference pipeline and print the report.

    Steps: (1) detect a generator family for every HTML filing,
    (2) load cleaned paragraphs and group them per filing,
    (3) aggregate quality metrics per generator and per filing,
    (4) print the report to stdout (progress goes to stderr).
    """
    # ── Step 1: Detect generators for all HTML files ──
    print("Step 1: Detecting generators for all HTML files...", file=sys.stderr)
    accession_to_generator = {}
    files = sorted(HTML_DIR.glob("*.html"))
    for i, fp in enumerate(files):
        accession = fp.stem
        gen_raw = detect_generator(str(fp))
        # Collapse raw labels into families; unmapped labels pass through.
        gen_family = FAMILY_MAP.get(gen_raw, gen_raw)
        accession_to_generator[accession] = gen_family
        if (i + 1) % 3000 == 0:
            print(f"  {i+1}/{len(files)} files processed...", file=sys.stderr)
    print(f"  Done: {len(files)} files, {len(set(accession_to_generator.values()))} generator families", file=sys.stderr)

    # ── Step 2: Load paragraphs and compute per-filing stats ──
    print("Step 2: Loading paragraphs...", file=sys.stderr)

    # Per-filing data.  (An unused Counter from an earlier revision was
    # removed here.)
    filing_paragraphs = defaultdict(list)  # accession -> list of paragraph dicts

    # First pass: collect all textHashes and the filings they occur in.
    text_hash_filings = defaultdict(set)  # textHash -> set of accessions
    all_paragraphs = []

    with open(PARAGRAPHS_FILE) as f:
        for line in f:
            p = json.loads(line)
            acc = p["filing"]["accessionNumber"]
            all_paragraphs.append(p)
            filing_paragraphs[acc].append(p)
            text_hash_filings[p["textHash"]].add(acc)

    print(f"  {len(all_paragraphs)} paragraphs across {len(filing_paragraphs)} filings", file=sys.stderr)

    # Boilerplate: a textHash appearing verbatim in 3+ distinct filings.
    boilerplate_hashes = {h for h, accs in text_hash_filings.items() if len(accs) >= 3}
    print(f"  {len(boilerplate_hashes)} boilerplate hashes (in 3+ filings)", file=sys.stderr)

    # ── Step 3: Compute metrics per generator ──
    print("Step 3: Computing metrics...", file=sys.stderr)

    # Per-generator aggregate counters/lists.
    gen_stats = defaultdict(lambda: {
        "total_paragraphs": 0,
        "total_filings": 0,
        "paragraphs_per_filing": [],
        "word_counts": [],
        "inlined_header": 0,
        "orphan_word": 0,
        "fragment": 0,
        "truncated": 0,
        "boilerplate": 0,
    })

    # Per-filing issue rates for the "most problematic" analysis.
    filing_issue_rates = {}  # accession -> {metrics..., combined_rate}

    # Filings in the paragraph file with no matching HTML file on disk.
    missing_gen = 0

    for acc, paragraphs in filing_paragraphs.items():
        gen = accession_to_generator.get(acc)
        if gen is None:
            missing_gen += 1
            gen = "(no HTML file)"

        stats = gen_stats[gen]
        stats["total_filings"] += 1
        stats["total_paragraphs"] += len(paragraphs)
        stats["paragraphs_per_filing"].append(len(paragraphs))

        # Per-filing counters for the combined issue rate.
        f_inlined = 0
        f_orphan = 0
        f_fragment = 0
        f_truncated = 0
        f_boilerplate = 0

        for p in paragraphs:
            text = p["text"]
            # Prefer the precomputed wordCount field; fall back to splitting.
            wc = p.get("wordCount", len(text.split()))
            stats["word_counts"].append(wc)

            if is_inlined_header(text):
                stats["inlined_header"] += 1
                f_inlined += 1
            if is_orphan_word(text):
                stats["orphan_word"] += 1
                f_orphan += 1
            if is_fragment(text):
                stats["fragment"] += 1
                f_fragment += 1
            if is_truncated(text):
                stats["truncated"] += 1
                f_truncated += 1
            if p["textHash"] in boilerplate_hashes:
                stats["boilerplate"] += 1
                f_boilerplate += 1

        n = len(paragraphs)
        if n > 0:
            filing_issue_rates[acc] = {
                "generator": gen,
                "n_paragraphs": n,
                "inlined_header_rate": f_inlined / n,
                "orphan_word_rate": f_orphan / n,
                "fragment_rate": f_fragment / n,
                "truncation_rate": f_truncated / n,
                "boilerplate_rate": f_boilerplate / n,
                # Mean of the five per-metric rates.
                "combined_rate": (f_inlined + f_orphan + f_fragment + f_truncated + f_boilerplate) / (5 * n),
            }

    if missing_gen:
        print(f"  Note: {missing_gen} filings had no matching HTML file", file=sys.stderr)

    # ── Step 4: Output ──

    # Corpus-wide averages used as the baseline for flagging outliers.
    corpus_total = sum(s["total_paragraphs"] for s in gen_stats.values())
    corpus_inlined = sum(s["inlined_header"] for s in gen_stats.values())
    corpus_orphan = sum(s["orphan_word"] for s in gen_stats.values())
    corpus_fragment = sum(s["fragment"] for s in gen_stats.values())
    corpus_truncated = sum(s["truncated"] for s in gen_stats.values())
    corpus_boilerplate = sum(s["boilerplate"] for s in gen_stats.values())
    corpus_avg_wc = statistics.mean(
        wc for s in gen_stats.values() for wc in s["word_counts"]
    ) if corpus_total > 0 else 0

    avg_rates = {
        "inlined_header": corpus_inlined / corpus_total if corpus_total else 0,
        "orphan_word": corpus_orphan / corpus_total if corpus_total else 0,
        "fragment": corpus_fragment / corpus_total if corpus_total else 0,
        "truncated": corpus_truncated / corpus_total if corpus_total else 0,
        "boilerplate": corpus_boilerplate / corpus_total if corpus_total else 0,
    }

    # Format a rate, appending ** when >2x the corpus average.  Hoisted out
    # of the per-generator loop (it only closes over avg_rates).
    def fmt_rate(val, avg_key):
        pct = f"{val:.1%}"
        if avg_rates[avg_key] > 0 and val > 2 * avg_rates[avg_key]:
            return f"{pct:>6}**"
        return f"{pct:>8}"

    print()
    print("=" * 180)
    print("GENERATOR QUALITY CROSS-REFERENCE: SEC-cyBERT CORPUS")
    print("=" * 180)
    print(f"\nCorpus totals: {corpus_total:,} paragraphs across {sum(s['total_filings'] for s in gen_stats.values()):,} filings")
    print(f"Corpus averages: InlinedHdr={avg_rates['inlined_header']:.1%} Orphan={avg_rates['orphan_word']:.1%} "
          f"Fragment={avg_rates['fragment']:.1%} Truncated={avg_rates['truncated']:.1%} "
          f"Boilerplate={avg_rates['boilerplate']:.1%} AvgWC={corpus_avg_wc:.1f}")
    print("(Cells marked with ** are >2x the corpus average)")

    # Sort generators by total paragraphs, descending.
    sorted_gens = sorted(gen_stats.items(), key=lambda x: x[1]["total_paragraphs"], reverse=True)

    # Table header.
    print()
    hdr = (
        f"{'Generator':<35} {'Files':>6} {'Paras':>7} {'Mean/F':>7} {'Med/F':>6} "
        f"{'AvgWC':>6} {'InlHdr%':>8} {'Orphan%':>8} {'Frag%':>8} {'Trunc%':>8} {'Boiler%':>8}"
    )
    print(hdr)
    print("-" * len(hdr))

    for gen, s in sorted_gens:
        n = s["total_paragraphs"]
        if n == 0:
            continue
        nf = s["total_filings"]
        mean_ppf = n / nf if nf else 0
        med_ppf = statistics.median(s["paragraphs_per_filing"]) if s["paragraphs_per_filing"] else 0
        avg_wc = statistics.mean(s["word_counts"]) if s["word_counts"] else 0

        inl_r = s["inlined_header"] / n
        orp_r = s["orphan_word"] / n
        fra_r = s["fragment"] / n
        tru_r = s["truncated"] / n
        boi_r = s["boilerplate"] / n

        row = (
            f"{gen:<35} {nf:>6} {n:>7} {mean_ppf:>7.1f} {med_ppf:>6.0f} "
            f"{avg_wc:>6.1f} {fmt_rate(inl_r, 'inlined_header')} {fmt_rate(orp_r, 'orphan_word')} "
            f"{fmt_rate(fra_r, 'fragment')} {fmt_rate(tru_r, 'truncated')} {fmt_rate(boi_r, 'boilerplate')}"
        )
        print(row)

    print("-" * len(hdr))

    # Corpus average row.
    corpus_med_ppf = statistics.median(
        ppf for s in gen_stats.values() for ppf in s["paragraphs_per_filing"]
    )
    corpus_mean_ppf = corpus_total / sum(s["total_filings"] for s in gen_stats.values())
    print(
        f"{'CORPUS AVERAGE':<35} "
        f"{sum(s['total_filings'] for s in gen_stats.values()):>6} "
        f"{corpus_total:>7} "
        f"{corpus_mean_ppf:>7.1f} {corpus_med_ppf:>6.0f} "
        f"{corpus_avg_wc:>6.1f} "
        f"{avg_rates['inlined_header']:>7.1%} "
        f"{avg_rates['orphan_word']:>7.1%} "
        f"{avg_rates['fragment']:>7.1%} "
        f"{avg_rates['truncated']:>7.1%} "
        f"{avg_rates['boilerplate']:>7.1%}"
    )

    # ── 10 Most Problematic Filings ──
    print()
    print("=" * 180)
    print("10 MOST PROBLEMATIC FILINGS (highest combined issue rate across all 5 metrics)")
    print("=" * 180)

    # Only consider filings with at least 3 paragraphs to avoid noisy tiny filings.
    eligible = {acc: fr for acc, fr in filing_issue_rates.items() if fr["n_paragraphs"] >= 3}
    worst = sorted(eligible.items(), key=lambda x: x[1]["combined_rate"], reverse=True)[:10]

    print()
    hdr2 = (
        f"{'Accession':<30} {'Generator':<35} {'Paras':>5} "
        f"{'InlHdr':>7} {'Orphan':>7} {'Frag':>7} {'Trunc':>7} {'Boiler':>7} {'Combined':>8}"
    )
    print(hdr2)
    print("-" * len(hdr2))
    for acc, fr in worst:
        print(
            f"{acc:<30} {fr['generator']:<35} {fr['n_paragraphs']:>5} "
            f"{fr['inlined_header_rate']:>6.1%} {fr['orphan_word_rate']:>6.1%} "
            f"{fr['fragment_rate']:>6.1%} {fr['truncation_rate']:>6.1%} "
            f"{fr['boilerplate_rate']:>6.1%} {fr['combined_rate']:>7.1%}"
        )

    # ── Per-metric worst generators summary ──
    print()
    print("=" * 180)
    print("GENERATORS >2x CORPUS AVERAGE (flagged metrics)")
    print("=" * 180)

    metric_names = {
        "inlined_header": "Inlined Header",
        "orphan_word": "Orphan Word",
        "fragment": "Fragment",
        "truncated": "Truncation",
        "boilerplate": "Boilerplate",
    }

    for metric_key, metric_label in metric_names.items():
        flagged = []
        for gen, s in sorted_gens:
            n = s["total_paragraphs"]
            # Skip tiny generators: a single bad paragraph would dominate.
            if n < 10:
                continue
            rate = s[metric_key] / n
            if avg_rates[metric_key] > 0 and rate > 2 * avg_rates[metric_key]:
                flagged.append((gen, rate, s[metric_key], n))
        if flagged:
            print(f"\n  {metric_label} rate (corpus avg: {avg_rates[metric_key]:.1%}, threshold >2x = {2*avg_rates[metric_key]:.1%}):")
            for gen, rate, count, total in sorted(flagged, key=lambda x: -x[1]):
                print(f"    {gen:<35} {rate:.1%} ({count}/{total})")
        else:
            print(f"\n  {metric_label}: No generators >2x corpus average")
# Run only when executed as a script, so the detection helpers remain
# importable without side effects.
if __name__ == "__main__":
    main()