538 lines
21 KiB
Python
538 lines
21 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Detect HTML generators for all SEC filing HTML files.
|
|
Phase 1: Exhaustive signature detection
|
|
Phase 2: Cluster remaining unknowns
|
|
Phase 3: Summary statistics
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
from collections import defaultdict, Counter
|
|
from pathlib import Path
|
|
|
|
# Directory of previously downloaded SEC filing HTML files to analyze.
HTML_DIR = Path("/home/joey/Documents/sec-cyBERT/data/raw/html")
# Only the first 20 KB of each file is read; generator signatures appear
# near the top of a filing (meta tags, vendor comments, SGML header).
READ_BYTES = 20_000

# Known SEC filing agent CIKs (accession number prefixes)
# NOTE(review): accession numbers start with the filer/agent CIK, so the
# prefix identifies the filing agent — mapping assumed correct, verify
# against EDGAR company lookup if extending.
FILING_AGENT_CIKS = {
    "0000950170": "Donnelley Financial Solutions",
    "0001193125": "Donnelley Financial Solutions",
    "0001558370": "Toppan Merrill",
    "0001654954": "Toppan Merrill",
}
|
|
|
|
|
|
def detect_generator(filepath: str) -> tuple[str, str]:
    """Read first 20KB of file and detect generator. Returns (generator, evidence).

    Runs a cascade of checks ordered from strongest evidence (explicit meta
    tags, vendor HTML comments) down to weak structural heuristics; the first
    match wins, so the ordering of the checks below is significant.
    The evidence string has the form "method: detail".
    """
    with open(filepath, "rb") as f:
        raw = f.read(READ_BYTES)

    # Decode leniently: a fixed-size binary read can split a multi-byte
    # UTF-8 sequence at the 20KB boundary.
    text = raw.decode("utf-8", errors="replace")
    text_lower = text.lower()

    # --- Explicit generator metadata ---

    # 1. <meta name="generator" content="..."> (both attribute orderings)
    m = re.search(
        r'<meta\s+name\s*=\s*["\']generator["\']\s+content\s*=\s*["\']([^"\']+)["\']',
        text, re.I,
    )
    if not m:
        # Some generators emit content= before name=.
        m = re.search(
            r'<meta\s+content\s*=\s*["\']([^"\']+)["\']\s+name\s*=\s*["\']generator["\']',
            text, re.I,
        )
    if m:
        return _normalize_generator(m.group(1)), f'meta generator: {m.group(1)}'

    # 2. <meta name="Creator" content="...">
    m = re.search(
        r'<meta\s+name\s*=\s*["\']Creator["\']\s+content\s*=\s*["\']([^"\']+)["\']',
        text, re.I,
    )
    if m:
        return _normalize_generator(m.group(1)), f'meta Creator: {m.group(1)}'

    # 4. <meta name="Producer" content="...">
    m = re.search(
        r'<meta\s+name\s*=\s*["\']Producer["\']\s+content\s*=\s*["\']([^"\']+)["\']',
        text, re.I,
    )
    if m:
        return _normalize_generator(m.group(1)), f'meta Producer: {m.group(1)}'

    # 15. ProgId meta tag (Word, Excel converters)
    m = re.search(
        r'<meta\s+name\s*=\s*["\']ProgId["\']\s+content\s*=\s*["\']([^"\']+)["\']',
        text, re.I,
    )
    if m:
        progid = m.group(1)
        if "word" in progid.lower():
            return "Microsoft Word", f"ProgId: {progid}"
        if "excel" in progid.lower():
            return "Microsoft Excel", f"ProgId: {progid}"
        return _normalize_generator(progid), f"ProgId: {progid}"

    # --- HTML comment signatures (search full 20KB) ---

    # Workiva / Wdesk
    if re.search(r"<!--.*Created with the Workiva Platform.*-->", text, re.I):
        return "Workiva", "comment: Created with the Workiva Platform"
    if re.search(r"<!--.*Copyright\s+\d{4}\s+Workiva.*-->", text, re.I):
        return "Workiva", "comment: Copyright Workiva"
    if re.search(r"<!--.*Document created using Wdesk.*-->", text, re.I):
        return "Workiva", "comment: Document created using Wdesk"

    # Toppan Merrill / Bridge
    if re.search(r"<!--.*(?:Toppan\s*Merrill|iXBRL document created with.*Toppan).*-->", text, re.I):
        return "Toppan Merrill", "comment: Toppan Merrill"
    if re.search(r"<!--.*Merrill\s*Bridge.*-->", text, re.I):
        return "Toppan Merrill", "comment: Merrill Bridge"

    # Donnelley Financial Solutions / RR Donnelley
    if re.search(r"<!--.*Donnelley Financial Solutions.*-->", text, re.I):
        return "Donnelley Financial Solutions", "comment: Donnelley Financial Solutions"
    if re.search(r"<!--.*RR\s*Donnelley.*-->", text, re.I):
        return "Donnelley Financial Solutions", "comment: RR Donnelley"

    # Broadridge PROfile
    if re.search(r"<!--.*Broadridge\s+PROfile.*-->", text, re.I):
        return "Broadridge PROfile", "comment: Broadridge PROfile"
    # Also match "Licensed to: ... Document created using Broadridge PROfile"
    if "broadridge" in text_lower:
        return "Broadridge PROfile", "keyword: broadridge"

    # SEC Publisher (in title or comment)
    # title_text is also reused by later checks (PDF to EDGAR, title tool names).
    m_title = re.search(r"<title[^>]*>([^<]+)</title>", text, re.I)
    title_text = m_title.group(1).strip() if m_title else ""
    if "sec publisher" in text_lower or "sec publisher" in title_text.lower():
        return "SEC Publisher", "title/keyword: SEC Publisher"

    # IRIS Carbon (various filing agents using IRIS Carbon platform)
    m = re.search(r"<!--.*Powered by IRIS Carbon.*-->", text, re.I)
    if m:
        # Extract the filing agent name before "Powered by IRIS Carbon"
        m2 = re.search(r"<!--\s*([^,]+),\s*Powered by IRIS Carbon", text, re.I)
        agent = m2.group(1).strip() if m2 else "Unknown agent"
        return "IRIS Carbon", f"comment: {agent} via IRIS Carbon"

    # Certent Disclosure Management
    if re.search(r"<!--.*Certent\s+Disclosure\s+Management.*-->", text, re.I):
        return "Certent", "comment: Certent Disclosure Management"
    if "certent" in text_lower:
        return "Certent", "keyword: certent"

    # CompSci Resources, LLC
    if re.search(r"<!--.*CompSci Resources.*-->", text, re.I):
        return "CompSci Transform", "comment: CompSci Resources"

    # RDG Portal
    if re.search(r"<!--.*RDG Portal.*-->", text, re.I):
        return "RDG Portal", "comment: RDG Portal"

    # PDF to EDGAR
    if title_text.lower() == "pdf to edgar" or "pdf to edgar" in text_lower[:2000]:
        return "PDF to EDGAR", "title/keyword: PDF to EDGAR"

    # Generic generated/created by comments (but NOT bare dates)
    m = re.search(r"<!--\s*Generated\s+by\s+([^-]+?)-->", text, re.I)
    if m:
        val = m.group(1).strip()
        # Reject bare dates like "<!-- Generated by 01/02/2020 -->".
        if not re.match(r"^\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}", val):
            return _normalize_generator(val), f"comment: Generated by {val}"
    m = re.search(r"<!--\s*Created\s+(?:by|with)\s+([^-]+?)-->", text, re.I)
    if m:
        val = m.group(1).strip()
        if not re.match(r"^\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}", val):
            return _normalize_generator(val), f"comment: Created by/with {val}"

    # --- Keyword signatures in full text ---

    # 5. Workiva
    if re.search(r"\bwdesk\b", text_lower):
        return "Workiva", "keyword: wdesk"
    if re.search(r"\bworkiva\b", text_lower):
        return "Workiva", "keyword: workiva"

    # 6. Donnelley/DFIN
    if re.search(r"\brrdonnelley\b", text_lower):
        return "Donnelley Financial Solutions", "keyword: rrdonnelley"
    if re.search(r"\bedgar-online\b", text_lower):
        return "Donnelley Financial Solutions", "keyword: edgar-online"

    # 7. Toppan Merrill
    if re.search(r"\btoppan\b", text_lower):
        return "Toppan Merrill", "keyword: toppan"
    # "merrill" alone is too weak (could be a company name); require an
    # XBRL/Bridge co-occurrence.
    if re.search(r"\bmerrill\b", text_lower) and re.search(r"\b(?:bridge|ixbrl|xbrl)\b", text_lower):
        return "Toppan Merrill", "keyword: merrill + bridge/xbrl"
    if re.search(r"\bbowne\b", text_lower):
        return "Toppan Merrill", "keyword: bowne"

    # 8. CompSci Transform
    if re.search(r"\bcompsci\b", text_lower):
        return "CompSci Transform", "keyword: compsci"

    # 9. ThunderDome
    if re.search(r"\bthunderdome\b", text_lower):
        return "ThunderDome", "keyword: thunderdome"

    # 10. GoXBRL
    if re.search(r"\bgoxbrl\b", text_lower):
        return "GoXBRL", "keyword: goxbrl"

    # 16. CSS class naming patterns
    if re.search(r'class\s*=\s*["\'][^"\']*\bwk_\w+', text_lower):
        return "Workiva", "CSS class prefix: wk_"

    # --- SGML document wrapper detection ---
    # <DOCUMENT><TYPE> marks a raw EDGAR SGML submission wrapper.
    has_sgml = re.search(r"<DOCUMENT>\s*\n?\s*<TYPE>", text, re.I)
    if has_sgml:
        m_fn = re.search(r"<FILENAME>\s*([\w\-\.]+)", text, re.I)
        if m_fn:
            filename = m_fn.group(1).lower()
            # d + digits = Donnelley Financial Solutions
            if re.match(r"d\d+", filename):
                return "Donnelley Financial Solutions", f"SGML filename: {m_fn.group(1)}"
            # tm + digits = Toppan Merrill
            if re.match(r"tm\d+", filename):
                return "Toppan Merrill", f"SGML filename: {m_fn.group(1)}"
            # ea + digits = EFiling/EDGAR Agent
            if re.match(r"ea\d+", filename):
                return "EFiling/EDGAR Agent", f"SGML filename: {m_fn.group(1)}"

        # SGML-wrapped but no known filename pattern — check for other signals inside
        # Rule-Page comments = Broadridge/EFiling variant
        if "<!-- field: rule-page" in text_lower or "rule-page" in text_lower[:5000]:
            return "Broadridge PROfile", "SGML + Rule-Page field comments"

        # Field: Set comments with xdx = EFiling XDX tool
        if "field: set; name: xdx" in text_lower:
            return "EFiling XDX", "SGML + xdx Field:Set comments"

        # <!-- Field: Set --> or <!-- Field: Rule --> without xdx
        if "<!-- field:" in text_lower[:5000]:
            return "EFiling/EDGAR Agent", "SGML + Field comments"

        # Donnelley structural pattern: Center/DIV 8.5in
        if re.search(r'<Center><DIV STYLE="width:8\.5in"', text):
            return "Donnelley Financial Solutions", "SGML + Center/DIV 8.5in layout"

        # Check accession prefix for known filing agents
        basename = os.path.basename(filepath)
        accession_prefix = basename.split("-")[0]
        if accession_prefix in FILING_AGENT_CIKS:
            return FILING_AGENT_CIKS[accession_prefix], f"SGML + filing agent CIK {accession_prefix}"

        # Remaining SGML-wrapped: classify by structural patterns
        font_count = text_lower.count("<font")
        if font_count > 5:
            return "SGML-wrapped (legacy/font-based)", f"SGML + {font_count} <font> tags"

        return "SGML-wrapped (unknown)", "SGML wrapper, no specific generator"

    # --- Inline XBRL detection for non-SGML files ---
    has_ix_ns = "xmlns:ix=" in text_lower or "<ix:header" in text_lower

    # 12. Structural: Donnelley uppercase P STYLE + Center DIV 8.5in
    # Deliberately case-sensitive: the uppercase tag/attr casing is the signature.
    if re.search(r'<P STYLE="[^"]*font-family:Times New Roman"', text) and re.search(
        r'<Center><DIV STYLE="width:8\.5in"', text
    ):
        return "Donnelley Financial Solutions", "structural: uppercase P STYLE + Center DIV 8.5in"

    # 14. Title tag tool names
    if title_text:
        title_lower = title_text.lower()
        if "workiva" in title_lower or "wdesk" in title_lower:
            return "Workiva", f"title: {title_text}"

    if has_ix_ns:
        # 11. ix:header with tool info / Field comments
        if "field: set; name: xdx" in text_lower:
            return "EFiling XDX", "iXBRL + xdx Field:Set comments"

        if "<!-- field: rule" in text_lower:
            return "Broadridge PROfile", "iXBRL + Rule-Page field comments"

        if "<!-- field:" in text_lower[:5000]:
            return "EFiling/EDGAR Agent", "iXBRL + Field comments"

        # Filing agent CIK-based detection
        basename = os.path.basename(filepath)
        accession_prefix = basename.split("-")[0]
        if accession_prefix in FILING_AGENT_CIKS:
            agent = FILING_AGENT_CIKS[accession_prefix]
            return f"{agent}", f"iXBRL + filing agent CIK {accession_prefix}"

        # 13. XML declaration encoding as structural signal
        if '<?xml version="1.0" encoding="utf-8"' in text_lower[:200]:
            return "Inline XBRL (utf-8 toolchain)", "iXBRL + utf-8 XML declaration"

        if "<?xml version='1.0' encoding='ascii'?>" in text_lower[:200]:
            if re.search(r'<div style="display:none"><ix:header>', text_lower[:3000]):
                return "Inline XBRL (SEC/EDGAR standard)", "iXBRL + ASCII XML + hidden ix:header"
            return "Inline XBRL (SEC/EDGAR standard)", "iXBRL + ASCII XML declaration"

        # Generic inline XBRL with no other signal
        return "Inline XBRL (tool unresolved)", "iXBRL namespace only"

    # --- Structural fallbacks for non-XBRL files ---
    font_count = text_lower.count("<font")
    td_count = text_lower.count("<td")
    span_count = text_lower.count("<span")

    if font_count > 20:
        return "Legacy generator (font-based)", f"structural: {font_count} <font> tags"

    if td_count > 50 and span_count < 10:
        return "Table-based generator", f"structural: {td_count} <td> tags"

    data_attr_count = len(re.findall(r"\bdata-\w+", text_lower))
    if data_attr_count > 10:
        return "Modern web tooling", f"structural: {data_attr_count} data- attributes"

    return "Unknown", "no signature detected"
|
|
|
|
|
|
def _normalize_generator(raw: str) -> str:
|
|
"""Normalize generator names to canonical forms."""
|
|
r = raw.strip().lower()
|
|
if "workiva" in r or "wdesk" in r:
|
|
return "Workiva"
|
|
if "donnelley" in r or "dfin" in r or "rrdonnelley" in r:
|
|
return "Donnelley Financial Solutions"
|
|
if ("toppan" in r) or ("merrill" in r and "bridge" in r):
|
|
return "Toppan Merrill"
|
|
if "word" in r and "microsoft" in r:
|
|
return "Microsoft Word"
|
|
if "excel" in r and "microsoft" in r:
|
|
return "Microsoft Excel"
|
|
if "thunderdome" in r:
|
|
return "ThunderDome"
|
|
if "goxbrl" in r:
|
|
return "GoXBRL"
|
|
if "compsci" in r:
|
|
return "CompSci Transform"
|
|
if "certent" in r:
|
|
return "Certent"
|
|
if "iris carbon" in r:
|
|
return "IRIS Carbon"
|
|
if "broadridge" in r or "profile" in r:
|
|
return "Broadridge PROfile"
|
|
if "sec publisher" in r:
|
|
return "SEC Publisher"
|
|
return raw.strip()
|
|
|
|
|
|
def extract_body_snippet(filepath: str) -> str:
    """Return up to 200 characters following the <body> tag, whitespace-collapsed.

    Falls back to the first 200 characters of the file when no <body> tag is
    found within the first READ_BYTES of the document.
    """
    with open(filepath, "rb") as handle:
        decoded = handle.read(READ_BYTES).decode("utf-8", errors="replace")

    match = re.search(r"<body[^>]*>(.*)", decoded, re.I | re.S)
    if match is None:
        snippet = decoded[:200]
    else:
        snippet = match.group(1)[:200].strip()
    return re.sub(r"\s+", " ", snippet)
|
|
|
|
|
|
def extract_class_names(filepath: str, max_elements: int = 10) -> list[str]:
    """Return up to *max_elements* class-attribute values from the file head.

    Only the first READ_BYTES of the file are scanned; each list entry is the
    full (possibly space-separated) value of one class="..." attribute.
    """
    with open(filepath, "rb") as handle:
        head = handle.read(READ_BYTES).decode("utf-8", errors="replace")

    pattern = r'class\s*=\s*["\']([^"\']+)["\']'
    found = re.findall(pattern, head, re.I)
    return found[:max_elements]
|
|
|
|
|
|
def main():
    """Detect the HTML generator of every filing and print a three-phase report.

    Phase 1 lists per-generator counts, detection methods, and example
    accessions; Phase 2 clusters files with no detected signature by a
    structural fingerprint; Phase 3 prints summary tables, flat and
    consolidated by tool family.

    Side effects only: reads files under HTML_DIR, writes the report to
    stdout and progress lines to stderr.
    """
    files = sorted(HTML_DIR.glob("*.html"))
    total = len(files)
    print(f"Processing {total} HTML files...\n")

    # Guard the empty case: every percentage below divides by `total`,
    # which would otherwise raise ZeroDivisionError.
    if total == 0:
        print("No HTML files found; nothing to report.")
        return

    results: dict[str, tuple[str, str]] = {}
    generator_examples: dict[str, list[str]] = defaultdict(list)
    generator_methods: dict[str, set[str]] = defaultdict(set)

    for i, fp in enumerate(files):
        accession = fp.stem
        gen, evidence = detect_generator(str(fp))
        results[accession] = (gen, evidence)
        generator_examples[gen].append(accession)
        # Evidence strings have the form "method: detail"; keep the method
        # part so Phase 1/3 can report how each generator was identified.
        method = evidence.split(":")[0].strip()
        generator_methods[gen].add(method)

        if (i + 1) % 2000 == 0:
            print(f" Processed {i + 1}/{total}...", file=sys.stderr)

    # --- Phase 1 output ---
    print("=" * 110)
    print("PHASE 1: Generator Detection Results")
    print("=" * 110)

    gen_counts = Counter(gen for gen, _ in results.values())

    for gen, count in gen_counts.most_common():
        pct = count / total * 100
        examples = generator_examples[gen][:3]
        methods = ", ".join(sorted(generator_methods[gen]))
        print(f"\n {gen}")
        print(f" Count: {count:,} ({pct:.1f}%)")
        print(f" Methods: {methods}")
        print(f" Examples: {', '.join(examples)}")

    # --- Phase 2: Cluster unknowns ---
    unknowns = [acc for acc, (gen, _) in results.items() if gen == "Unknown"]
    print(f"\n\n{'=' * 110}")
    print(f"PHASE 2: Clustering {len(unknowns)} Unknown Files")
    print("=" * 110)

    if unknowns:
        fingerprints: dict[str, list[str]] = defaultdict(list)

        for acc in unknowns:
            fp = HTML_DIR / f"{acc}.html"
            with open(fp, "rb") as f:
                raw_bytes = f.read(READ_BYTES)
            text = raw_bytes.decode("utf-8", errors="replace")
            text_lower = text.lower()

            has_xml_decl = text.startswith("<?xml")
            has_doctype = "<!doctype" in text_lower[:500]
            first_tag_m = re.search(r"<(\w+)", text)
            first_tag = first_tag_m.group(1).lower() if first_tag_m else ""

            # Rough layout profile: which element type dominates the head
            # of the document.
            td_c = text_lower.count("<td")
            span_c = text_lower.count("<span")
            div_c = text_lower.count("<div")
            p_c = text_lower.count("<p ")
            font_c = text_lower.count("<font")

            counts = {"td": td_c, "span": span_c, "div": div_c, "p": p_c, "font": font_c}
            dominant = max(counts, key=counts.get) if max(counts.values()) > 0 else "empty"

            # First CSS class prefix is a cheap toolchain fingerprint
            # (e.g. Workiva uses wk_-prefixed classes).
            classes = re.findall(r'class\s*=\s*["\']([^"\']+)["\']', text[:5000], re.I)
            class_prefix = ""
            if classes:
                fc = classes[0].split()[0]
                if "_" in fc:
                    class_prefix = fc.split("_")[0] + "_"
                elif "-" in fc:
                    class_prefix = fc.split("-")[0] + "-"
                else:
                    class_prefix = fc[:4]

            fingerprint = (
                f"xml={has_xml_decl}|doctype={has_doctype}|first={first_tag}"
                f"|layout={dominant}|cls={class_prefix}"
            )
            fingerprints[fingerprint].append(acc)

        # Largest clusters first.
        for idx, (fp_key, accs) in enumerate(
            sorted(fingerprints.items(), key=lambda x: -len(x[1]))
        ):
            print(f"\n Cluster {idx + 1} ({len(accs)} files): {fp_key}")
            for acc in accs[:5]:
                filepath = HTML_DIR / f"{acc}.html"
                snippet = extract_body_snippet(str(filepath))
                cls = extract_class_names(str(filepath), 5)
                print(f" {acc}:")
                print(f" Snippet: {snippet[:120]}")
                if cls:
                    print(f" Classes: {cls[:5]}")
            if len(accs) > 5:
                print(f" ... and {len(accs) - 5} more files")
    else:
        print(" No truly unknown files remain!")

    # --- Phase 3: Summary ---
    print(f"\n\n{'=' * 110}")
    print("PHASE 3: Summary Statistics")
    print("=" * 110)

    header = (
        f"\n{'Generator':<45} {'Count':>7} {'%':>7} "
        f"{'Detection Methods':<50} {'Examples (up to 3)'}"
    )
    print(header)
    print("-" * 170)

    for gen, count in gen_counts.most_common():
        pct = count / total * 100
        examples = ", ".join(generator_examples[gen][:3])
        methods = ", ".join(sorted(generator_methods[gen]))
        if len(methods) > 50:
            methods = methods[:47] + "..."
        print(f"{gen:<45} {count:>7} {pct:>6.1f}% {methods:<50} {examples}")

    print("-" * 170)
    print(f"{'TOTAL':<45} {total:>7} {100.0:>6.1f}%")

    unknown_count = gen_counts.get("Unknown", 0)
    identified = total - unknown_count
    print(f"\nIdentified: {identified:,} / {total:,} ({identified / total * 100:.1f}%)")
    print(f"Truly unidentified: {unknown_count:,} / {total:,} ({unknown_count / total * 100:.1f}%)")

    # Consolidated view: group by parent tool family
    print(f"\n\n{'=' * 110}")
    print("CONSOLIDATED VIEW (grouped by tool family)")
    print("=" * 110)

    family_map = {
        "Workiva": "Workiva",
        "Donnelley Financial Solutions": "Donnelley Financial Solutions",
        "Toppan Merrill": "Toppan Merrill",
        "CompSci Transform": "CompSci Transform",
        "ThunderDome": "ThunderDome",
        "EFiling/EDGAR Agent": "EFiling/EDGAR Agent",
        "EFiling XDX": "EFiling/EDGAR Agent",
        "Broadridge PROfile": "Broadridge PROfile",
        "SEC Publisher": "SEC Publisher",
        "IRIS Carbon": "IRIS Carbon",
        "RDG Portal": "RDG Portal",
        "Certent": "Certent",
        "PDF to EDGAR": "PDF to EDGAR",
        "GoXBRL": "GoXBRL",
        "Microsoft Word": "Microsoft Word",
        "Microsoft Excel": "Microsoft Excel",
        "Inline XBRL (SEC/EDGAR standard)": "Inline XBRL (unattributed)",
        "Inline XBRL (utf-8 toolchain)": "Inline XBRL (unattributed)",
        "Inline XBRL (tool unresolved)": "Inline XBRL (unattributed)",
        "SGML-wrapped (legacy/font-based)": "SGML-wrapped (unattributed)",
        "SGML-wrapped (unknown)": "SGML-wrapped (unattributed)",
        "Legacy generator (font-based)": "Other/Legacy",
        "Table-based generator": "Other/Legacy",
        "Modern web tooling": "Other/Legacy",
        "Unknown": "Unknown",
    }

    family_counts: Counter = Counter()
    family_examples: dict[str, list[str]] = defaultdict(list)

    for gen, count in gen_counts.items():
        # Unmapped generator names pass through as their own family.
        family = family_map.get(gen, gen)
        family_counts[family] += count
        family_examples[family].extend(generator_examples[gen][:3])

    print(f"\n{'Tool Family':<45} {'Count':>7} {'%':>7}")
    print("-" * 65)
    for family, count in family_counts.most_common():
        pct = count / total * 100
        examples = ", ".join(family_examples[family][:3])
        print(f"{family:<45} {count:>7} {pct:>6.1f}% {examples}")
    print("-" * 65)
    print(f"{'TOTAL':<45} {total:>7} {100.0:>6.1f}%")
|
|
|
|
|
|
# Entry point guard: allows the detection helpers to be imported without
# triggering a full filesystem scan.
if __name__ == "__main__":
    main()
|