SEC-cyBERT/scripts/detect_generators.py
2026-03-29 20:33:39 -04:00

538 lines
21 KiB
Python

#!/usr/bin/env python3
"""
Detect HTML generators for all SEC filing HTML files.
Phase 1: Exhaustive signature detection
Phase 2: Cluster remaining unknowns
Phase 3: Summary statistics
"""
import os
import re
import sys
from collections import defaultdict, Counter
from pathlib import Path
HTML_DIR = Path("/home/joey/Documents/sec-cyBERT/data/raw/html")
READ_BYTES = 20_000
# Known SEC filing agent CIKs (accession number prefixes)
FILING_AGENT_CIKS = {
"0000950170": "Donnelley Financial Solutions",
"0001193125": "Donnelley Financial Solutions",
"0001558370": "Toppan Merrill",
"0001654954": "Toppan Merrill",
}
def detect_generator(filepath: str) -> tuple[str, str]:
    """Read first 20KB of file and detect generator. Returns (generator, evidence).

    The checks form an ordered, first-match-wins cascade from strongest to
    weakest signal: explicit <meta> metadata, HTML comment signatures,
    keyword hits, SGML <DOCUMENT> wrapper heuristics, inline-XBRL signals,
    and finally structural tag-count fallbacks.  The numbered comments
    (1, 2, 4, 15, ...) refer to the original signature catalogue and are
    intentionally not sequential in source order.
    """
    with open(filepath, "rb") as f:
        raw = f.read(READ_BYTES)
    # Filings may contain arbitrary bytes; never fail on decode.
    text = raw.decode("utf-8", errors="replace")
    text_lower = text.lower()
    # --- Explicit generator metadata ---
    # 1. <meta name="generator" content="..."> (both attribute orderings)
    m = re.search(
        r'<meta\s+name\s*=\s*["\']generator["\']\s+content\s*=\s*["\']([^"\']+)["\']',
        text, re.I,
    )
    if not m:
        # Some tools emit content= before name=; try the reversed ordering.
        m = re.search(
            r'<meta\s+content\s*=\s*["\']([^"\']+)["\']\s+name\s*=\s*["\']generator["\']',
            text, re.I,
        )
    if m:
        return _normalize_generator(m.group(1)), f'meta generator: {m.group(1)}'
    # 2. <meta name="Creator" content="...">
    m = re.search(
        r'<meta\s+name\s*=\s*["\']Creator["\']\s+content\s*=\s*["\']([^"\']+)["\']',
        text, re.I,
    )
    if m:
        return _normalize_generator(m.group(1)), f'meta Creator: {m.group(1)}'
    # 4. <meta name="Producer" content="...">
    m = re.search(
        r'<meta\s+name\s*=\s*["\']Producer["\']\s+content\s*=\s*["\']([^"\']+)["\']',
        text, re.I,
    )
    if m:
        return _normalize_generator(m.group(1)), f'meta Producer: {m.group(1)}'
    # 15. ProgId meta tag (Word, Excel converters)
    m = re.search(
        r'<meta\s+name\s*=\s*["\']ProgId["\']\s+content\s*=\s*["\']([^"\']+)["\']',
        text, re.I,
    )
    if m:
        progid = m.group(1)
        if "word" in progid.lower():
            return "Microsoft Word", f"ProgId: {progid}"
        if "excel" in progid.lower():
            return "Microsoft Excel", f"ProgId: {progid}"
        return _normalize_generator(progid), f"ProgId: {progid}"
    # --- HTML comment signatures (search full 20KB) ---
    # Workiva / Wdesk
    if re.search(r"<!--.*Created with the Workiva Platform.*-->", text, re.I):
        return "Workiva", "comment: Created with the Workiva Platform"
    if re.search(r"<!--.*Copyright\s+\d{4}\s+Workiva.*-->", text, re.I):
        return "Workiva", "comment: Copyright Workiva"
    if re.search(r"<!--.*Document created using Wdesk.*-->", text, re.I):
        return "Workiva", "comment: Document created using Wdesk"
    # Toppan Merrill / Bridge
    if re.search(r"<!--.*(?:Toppan\s*Merrill|iXBRL document created with.*Toppan).*-->", text, re.I):
        return "Toppan Merrill", "comment: Toppan Merrill"
    if re.search(r"<!--.*Merrill\s*Bridge.*-->", text, re.I):
        return "Toppan Merrill", "comment: Merrill Bridge"
    # Donnelley Financial Solutions / RR Donnelley
    if re.search(r"<!--.*Donnelley Financial Solutions.*-->", text, re.I):
        return "Donnelley Financial Solutions", "comment: Donnelley Financial Solutions"
    if re.search(r"<!--.*RR\s*Donnelley.*-->", text, re.I):
        return "Donnelley Financial Solutions", "comment: RR Donnelley"
    # Broadridge PROfile
    if re.search(r"<!--.*Broadridge\s+PROfile.*-->", text, re.I):
        return "Broadridge PROfile", "comment: Broadridge PROfile"
    # Also match "Licensed to: ... Document created using Broadridge PROfile"
    if "broadridge" in text_lower:
        return "Broadridge PROfile", "keyword: broadridge"
    # SEC Publisher (in title or comment)
    # title_text is also reused by the "PDF to EDGAR" and title checks below.
    m_title = re.search(r"<title[^>]*>([^<]+)</title>", text, re.I)
    title_text = m_title.group(1).strip() if m_title else ""
    if "sec publisher" in text_lower or "sec publisher" in title_text.lower():
        return "SEC Publisher", "title/keyword: SEC Publisher"
    # IRIS Carbon (various filing agents using IRIS Carbon platform)
    m = re.search(r"<!--.*Powered by IRIS Carbon.*-->", text, re.I)
    if m:
        # Extract the filing agent name before "Powered by IRIS Carbon"
        m2 = re.search(r"<!--\s*([^,]+),\s*Powered by IRIS Carbon", text, re.I)
        agent = m2.group(1).strip() if m2 else "Unknown agent"
        return "IRIS Carbon", f"comment: {agent} via IRIS Carbon"
    # Certent Disclosure Management
    if re.search(r"<!--.*Certent\s+Disclosure\s+Management.*-->", text, re.I):
        return "Certent", "comment: Certent Disclosure Management"
    if "certent" in text_lower:
        return "Certent", "keyword: certent"
    # CompSci Resources, LLC
    if re.search(r"<!--.*CompSci Resources.*-->", text, re.I):
        return "CompSci Transform", "comment: CompSci Resources"
    # RDG Portal
    if re.search(r"<!--.*RDG Portal.*-->", text, re.I):
        return "RDG Portal", "comment: RDG Portal"
    # PDF to EDGAR
    if title_text.lower() == "pdf to edgar" or "pdf to edgar" in text_lower[:2000]:
        return "PDF to EDGAR", "title/keyword: PDF to EDGAR"
    # Generic generated/created by comments (but NOT bare dates)
    m = re.search(r"<!--\s*Generated\s+by\s+([^-]+?)-->", text, re.I)
    if m:
        val = m.group(1).strip()
        # Skip comments that are merely timestamps, e.g. <!-- Generated by 01/02/21 -->
        if not re.match(r"^\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}", val):
            return _normalize_generator(val), f"comment: Generated by {val}"
    m = re.search(r"<!--\s*Created\s+(?:by|with)\s+([^-]+?)-->", text, re.I)
    if m:
        val = m.group(1).strip()
        if not re.match(r"^\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}", val):
            return _normalize_generator(val), f"comment: Created by/with {val}"
    # --- Keyword signatures in full text ---
    # 5. Workiva
    if re.search(r"\bwdesk\b", text_lower):
        return "Workiva", "keyword: wdesk"
    if re.search(r"\bworkiva\b", text_lower):
        return "Workiva", "keyword: workiva"
    # 6. Donnelley/DFIN
    if re.search(r"\brrdonnelley\b", text_lower):
        return "Donnelley Financial Solutions", "keyword: rrdonnelley"
    if re.search(r"\bedgar-online\b", text_lower):
        return "Donnelley Financial Solutions", "keyword: edgar-online"
    # 7. Toppan Merrill
    if re.search(r"\btoppan\b", text_lower):
        return "Toppan Merrill", "keyword: toppan"
    # "merrill" alone is ambiguous; require a bridge/xbrl co-occurrence.
    if re.search(r"\bmerrill\b", text_lower) and re.search(r"\b(?:bridge|ixbrl|xbrl)\b", text_lower):
        return "Toppan Merrill", "keyword: merrill + bridge/xbrl"
    if re.search(r"\bbowne\b", text_lower):
        return "Toppan Merrill", "keyword: bowne"
    # 8. CompSci Transform
    if re.search(r"\bcompsci\b", text_lower):
        return "CompSci Transform", "keyword: compsci"
    # 9. ThunderDome
    if re.search(r"\bthunderdome\b", text_lower):
        return "ThunderDome", "keyword: thunderdome"
    # 10. GoXBRL
    if re.search(r"\bgoxbrl\b", text_lower):
        return "GoXBRL", "keyword: goxbrl"
    # 16. CSS class naming patterns
    if re.search(r'class\s*=\s*["\'][^"\']*\bwk_\w+', text_lower):
        return "Workiva", "CSS class prefix: wk_"
    # --- SGML document wrapper detection ---
    # Every SGML-wrapped file returns from inside this branch.
    has_sgml = re.search(r"<DOCUMENT>\s*\n?\s*<TYPE>", text, re.I)
    if has_sgml:
        m_fn = re.search(r"<FILENAME>\s*([\w\-\.]+)", text, re.I)
        if m_fn:
            filename = m_fn.group(1).lower()
            # d + digits = Donnelley Financial Solutions
            if re.match(r"d\d+", filename):
                return "Donnelley Financial Solutions", f"SGML filename: {m_fn.group(1)}"
            # tm + digits = Toppan Merrill
            if re.match(r"tm\d+", filename):
                return "Toppan Merrill", f"SGML filename: {m_fn.group(1)}"
            # ea + digits = EFiling/EDGAR Agent
            if re.match(r"ea\d+", filename):
                return "EFiling/EDGAR Agent", f"SGML filename: {m_fn.group(1)}"
        # SGML-wrapped but no known filename pattern — check for other signals inside
        # Rule-Page comments = Broadridge/EFiling variant
        if "<!-- field: rule-page" in text_lower or "rule-page" in text_lower[:5000]:
            return "Broadridge PROfile", "SGML + Rule-Page field comments"
        # Field: Set comments with xdx = EFiling XDX tool
        if "field: set; name: xdx" in text_lower:
            return "EFiling XDX", "SGML + xdx Field:Set comments"
        # <!-- Field: Set --> or <!-- Field: Rule --> without xdx
        if "<!-- field:" in text_lower[:5000]:
            return "EFiling/EDGAR Agent", "SGML + Field comments"
        # Donnelley structural pattern: Center/DIV 8.5in
        if re.search(r'<Center><DIV STYLE="width:8\.5in"', text):
            return "Donnelley Financial Solutions", "SGML + Center/DIV 8.5in layout"
        # Check accession prefix for known filing agents
        basename = os.path.basename(filepath)
        accession_prefix = basename.split("-")[0]
        if accession_prefix in FILING_AGENT_CIKS:
            return FILING_AGENT_CIKS[accession_prefix], f"SGML + filing agent CIK {accession_prefix}"
        # Remaining SGML-wrapped: classify by structural patterns
        font_count = text_lower.count("<font")
        if font_count > 5:
            return "SGML-wrapped (legacy/font-based)", f"SGML + {font_count} <font> tags"
        return "SGML-wrapped (unknown)", "SGML wrapper, no specific generator"
    # --- Inline XBRL detection for non-SGML files ---
    has_ix_ns = "xmlns:ix=" in text_lower or "<ix:header" in text_lower
    # 12. Structural: Donnelley uppercase P STYLE + Center DIV 8.5in
    if re.search(r'<P STYLE="[^"]*font-family:Times New Roman"', text) and re.search(
        r'<Center><DIV STYLE="width:8\.5in"', text
    ):
        return "Donnelley Financial Solutions", "structural: uppercase P STYLE + Center DIV 8.5in"
    # 14. Title tag tool names
    if title_text:
        title_lower = title_text.lower()
        if "workiva" in title_lower or "wdesk" in title_lower:
            return "Workiva", f"title: {title_text}"
    if has_ix_ns:
        # 11. ix:header with tool info / Field comments
        if "field: set; name: xdx" in text_lower:
            return "EFiling XDX", "iXBRL + xdx Field:Set comments"
        if "<!-- field: rule" in text_lower:
            return "Broadridge PROfile", "iXBRL + Rule-Page field comments"
        if "<!-- field:" in text_lower[:5000]:
            return "EFiling/EDGAR Agent", "iXBRL + Field comments"
        # Filing agent CIK-based detection
        basename = os.path.basename(filepath)
        accession_prefix = basename.split("-")[0]
        if accession_prefix in FILING_AGENT_CIKS:
            agent = FILING_AGENT_CIKS[accession_prefix]
            return f"{agent}", f"iXBRL + filing agent CIK {accession_prefix}"
        # 13. XML declaration encoding as structural signal
        if '<?xml version="1.0" encoding="utf-8"' in text_lower[:200]:
            return "Inline XBRL (utf-8 toolchain)", "iXBRL + utf-8 XML declaration"
        if "<?xml version='1.0' encoding='ascii'?>" in text_lower[:200]:
            if re.search(r'<div style="display:none"><ix:header>', text_lower[:3000]):
                return "Inline XBRL (SEC/EDGAR standard)", "iXBRL + ASCII XML + hidden ix:header"
            return "Inline XBRL (SEC/EDGAR standard)", "iXBRL + ASCII XML declaration"
        # Generic inline XBRL with no other signal
        return "Inline XBRL (tool unresolved)", "iXBRL namespace only"
    # --- Structural fallbacks for non-XBRL files ---
    font_count = text_lower.count("<font")
    td_count = text_lower.count("<td")
    span_count = text_lower.count("<span")
    if font_count > 20:
        return "Legacy generator (font-based)", f"structural: {font_count} <font> tags"
    if td_count > 50 and span_count < 10:
        return "Table-based generator", f"structural: {td_count} <td> tags"
    data_attr_count = len(re.findall(r"\bdata-\w+", text_lower))
    if data_attr_count > 10:
        return "Modern web tooling", f"structural: {data_attr_count} data- attributes"
    return "Unknown", "no signature detected"
def _normalize_generator(raw: str) -> str:
"""Normalize generator names to canonical forms."""
r = raw.strip().lower()
if "workiva" in r or "wdesk" in r:
return "Workiva"
if "donnelley" in r or "dfin" in r or "rrdonnelley" in r:
return "Donnelley Financial Solutions"
if ("toppan" in r) or ("merrill" in r and "bridge" in r):
return "Toppan Merrill"
if "word" in r and "microsoft" in r:
return "Microsoft Word"
if "excel" in r and "microsoft" in r:
return "Microsoft Excel"
if "thunderdome" in r:
return "ThunderDome"
if "goxbrl" in r:
return "GoXBRL"
if "compsci" in r:
return "CompSci Transform"
if "certent" in r:
return "Certent"
if "iris carbon" in r:
return "IRIS Carbon"
if "broadridge" in r or "profile" in r:
return "Broadridge PROfile"
if "sec publisher" in r:
return "SEC Publisher"
return raw.strip()
def extract_body_snippet(filepath: str) -> str:
    """Return a whitespace-collapsed snippet (<= 200 chars) after the <body> tag.

    Falls back to the first 200 characters of the file when no <body>
    tag appears in the first READ_BYTES of the file.
    """
    with open(filepath, "rb") as handle:
        head = handle.read(READ_BYTES).decode("utf-8", errors="replace")
    match = re.search(r"<body[^>]*>(.*)", head, re.I | re.S)
    snippet = match.group(1)[:200].strip() if match else head[:200]
    # Collapse all runs of whitespace so the snippet fits on one line.
    return re.sub(r"\s+", " ", snippet)
def extract_class_names(filepath: str, max_elements: int = 10) -> list[str]:
    """Return up to ``max_elements`` class-attribute values from the file head.

    Only the first READ_BYTES of the file are scanned; each returned item
    is the raw value of one class="..." attribute (possibly multi-class).
    """
    with open(filepath, "rb") as handle:
        head = handle.read(READ_BYTES).decode("utf-8", errors="replace")
    found = re.findall(r'class\s*=\s*["\']([^"\']+)["\']', head, re.I)
    return found[:max_elements]
def main():
    """Detect generators for all filings in HTML_DIR and print a report.

    Phase 1 classifies every file via :func:`detect_generator`, Phase 2
    fingerprints and clusters the files that stayed "Unknown", and
    Phase 3 prints summary tables plus a consolidated tool-family view.

    Fixes vs. previous revision: bails out early when HTML_DIR has no
    *.html files (the percentage maths below divided by zero), and the
    unused GROUP_MAP dead code was removed.
    """
    files = sorted(HTML_DIR.glob("*.html"))
    total = len(files)
    print(f"Processing {total} HTML files...\n")
    # Guard: every percentage below divides by `total`.
    if total == 0:
        print("No HTML files found; nothing to do.")
        return
    results: dict[str, tuple[str, str]] = {}
    generator_examples: dict[str, list[str]] = defaultdict(list)
    generator_methods: dict[str, set[str]] = defaultdict(set)
    for i, fp in enumerate(files):
        accession = fp.stem
        gen, evidence = detect_generator(str(fp))
        results[accession] = (gen, evidence)
        generator_examples[gen].append(accession)
        # Evidence strings look like "method: detail"; keep the method part.
        method = evidence.split(":")[0].strip()
        generator_methods[gen].add(method)
        if (i + 1) % 2000 == 0:
            print(f" Processed {i + 1}/{total}...", file=sys.stderr)
    # --- Phase 1 output ---
    print("=" * 110)
    print("PHASE 1: Generator Detection Results")
    print("=" * 110)
    gen_counts = Counter(gen for gen, _ in results.values())
    for gen, count in gen_counts.most_common():
        pct = count / total * 100
        examples = generator_examples[gen][:3]
        methods = ", ".join(sorted(generator_methods[gen]))
        print(f"\n {gen}")
        print(f" Count: {count:,} ({pct:.1f}%)")
        print(f" Methods: {methods}")
        print(f" Examples: {', '.join(examples)}")
    # --- Phase 2: Cluster unknowns ---
    unknowns = [acc for acc, (gen, _) in results.items() if gen == "Unknown"]
    print(f"\n\n{'=' * 110}")
    print(f"PHASE 2: Clustering {len(unknowns)} Unknown Files")
    print("=" * 110)
    if unknowns:
        fingerprints: dict[str, list[str]] = defaultdict(list)
        for acc in unknowns:
            path = HTML_DIR / f"{acc}.html"
            with open(path, "rb") as f:
                raw_bytes = f.read(READ_BYTES)
            text = raw_bytes.decode("utf-8", errors="replace")
            text_lower = text.lower()
            # Structural fingerprint: XML decl, doctype, first tag,
            # dominant layout element, and leading CSS class prefix.
            has_xml_decl = text.startswith("<?xml")
            has_doctype = "<!doctype" in text_lower[:500]
            first_tag_m = re.search(r"<(\w+)", text)
            first_tag = first_tag_m.group(1).lower() if first_tag_m else ""
            td_c = text_lower.count("<td")
            span_c = text_lower.count("<span")
            div_c = text_lower.count("<div")
            p_c = text_lower.count("<p ")
            font_c = text_lower.count("<font")
            counts = {"td": td_c, "span": span_c, "div": div_c, "p": p_c, "font": font_c}
            dominant = max(counts, key=counts.get) if max(counts.values()) > 0 else "empty"
            classes = re.findall(r'class\s*=\s*["\']([^"\']+)["\']', text[:5000], re.I)
            class_prefix = ""
            if classes:
                fc = classes[0].split()[0]
                if "_" in fc:
                    class_prefix = fc.split("_")[0] + "_"
                elif "-" in fc:
                    class_prefix = fc.split("-")[0] + "-"
                else:
                    class_prefix = fc[:4]
            fingerprint = (
                f"xml={has_xml_decl}|doctype={has_doctype}|first={first_tag}"
                f"|layout={dominant}|cls={class_prefix}"
            )
            fingerprints[fingerprint].append(acc)
        # Largest clusters first.
        for idx, (fp_key, accs) in enumerate(
            sorted(fingerprints.items(), key=lambda x: -len(x[1]))
        ):
            print(f"\n Cluster {idx + 1} ({len(accs)} files): {fp_key}")
            for acc in accs[:5]:
                filepath = HTML_DIR / f"{acc}.html"
                snippet = extract_body_snippet(str(filepath))
                cls = extract_class_names(str(filepath), 5)
                print(f" {acc}:")
                print(f" Snippet: {snippet[:120]}")
                if cls:
                    print(f" Classes: {cls[:5]}")
            if len(accs) > 5:
                print(f" ... and {len(accs) - 5} more files")
    else:
        print(" No truly unknown files remain!")
    # --- Phase 3: Summary ---
    print(f"\n\n{'=' * 110}")
    print("PHASE 3: Summary Statistics")
    print("=" * 110)
    header = (
        f"\n{'Generator':<45} {'Count':>7} {'%':>7} "
        f"{'Detection Methods':<50} {'Examples (up to 3)'}"
    )
    print(header)
    print("-" * 170)
    for gen, count in gen_counts.most_common():
        pct = count / total * 100
        examples = ", ".join(generator_examples[gen][:3])
        methods = ", ".join(sorted(generator_methods[gen]))
        if len(methods) > 50:
            methods = methods[:47] + "..."
        print(f"{gen:<45} {count:>7} {pct:>6.1f}% {methods:<50} {examples}")
    print("-" * 170)
    print(f"{'TOTAL':<45} {total:>7} {100.0:>6.1f}%")
    unknown_count = gen_counts.get("Unknown", 0)
    identified = total - unknown_count
    print(f"\nIdentified: {identified:,} / {total:,} ({identified / total * 100:.1f}%)")
    print(f"Truly unidentified: {unknown_count:,} / {total:,} ({unknown_count / total * 100:.1f}%)")
    # Consolidated view: group by parent tool family
    print(f"\n\n{'=' * 110}")
    print("CONSOLIDATED VIEW (grouped by tool family)")
    print("=" * 110)
    # Maps each fine-grained generator label to its tool family; labels
    # absent from the map fall through unchanged via .get(gen, gen).
    family_map = {
        "Workiva": "Workiva",
        "Donnelley Financial Solutions": "Donnelley Financial Solutions",
        "Toppan Merrill": "Toppan Merrill",
        "CompSci Transform": "CompSci Transform",
        "ThunderDome": "ThunderDome",
        "EFiling/EDGAR Agent": "EFiling/EDGAR Agent",
        "EFiling XDX": "EFiling/EDGAR Agent",
        "Broadridge PROfile": "Broadridge PROfile",
        "SEC Publisher": "SEC Publisher",
        "IRIS Carbon": "IRIS Carbon",
        "RDG Portal": "RDG Portal",
        "Certent": "Certent",
        "PDF to EDGAR": "PDF to EDGAR",
        "GoXBRL": "GoXBRL",
        "Microsoft Word": "Microsoft Word",
        "Microsoft Excel": "Microsoft Excel",
        "Inline XBRL (SEC/EDGAR standard)": "Inline XBRL (unattributed)",
        "Inline XBRL (utf-8 toolchain)": "Inline XBRL (unattributed)",
        "Inline XBRL (tool unresolved)": "Inline XBRL (unattributed)",
        "SGML-wrapped (legacy/font-based)": "SGML-wrapped (unattributed)",
        "SGML-wrapped (unknown)": "SGML-wrapped (unattributed)",
        "Legacy generator (font-based)": "Other/Legacy",
        "Table-based generator": "Other/Legacy",
        "Modern web tooling": "Other/Legacy",
        "Unknown": "Unknown",
    }
    family_counts: Counter = Counter()
    family_examples: dict[str, list[str]] = defaultdict(list)
    for gen, count in gen_counts.items():
        family = family_map.get(gen, gen)
        family_counts[family] += count
        family_examples[family].extend(generator_examples[gen][:3])
    print(f"\n{'Tool Family':<45} {'Count':>7} {'%':>7}")
    print("-" * 65)
    for family, count in family_counts.most_common():
        pct = count / total * 100
        examples = ", ".join(family_examples[family][:3])
        print(f"{family:<45} {count:>7} {pct:>6.1f}% {examples}")
    print("-" * 65)
    print(f"{'TOTAL':<45} {total:>7} {100.0:>6.1f}%")
# Script entry point: run the full three-phase report when executed directly.
if __name__ == "__main__":
    main()