SEC-cyBERT/scripts/detect_generators.py
2026-03-29 20:33:39 -04:00

538 lines
21 KiB
Python

#!/usr/bin/env python3
"""
Detect HTML generators for all SEC filing HTML files.
Phase 1: Exhaustive signature detection
Phase 2: Cluster remaining unknowns
Phase 3: Summary statistics
"""
import os
import re
import sys
from collections import defaultdict, Counter
from pathlib import Path
HTML_DIR = Path("/home/joey/Documents/sec-cyBERT/data/raw/html")
READ_BYTES = 20_000
# Known SEC filing agent CIKs (accession number prefixes)
FILING_AGENT_CIKS = {
"0000950170": "Donnelley Financial Solutions",
"0001193125": "Donnelley Financial Solutions",
"0001558370": "Toppan Merrill",
"0001654954": "Toppan Merrill",
}
def detect_generator(filepath: str) -> tuple[str, str]:
    """Read first 20KB of file and detect generator. Returns (generator, evidence).

    The checks form an ordered, first-match-wins cascade from strongest to
    weakest signal: explicit <meta> metadata, HTML comment signatures,
    keyword hits, SGML <DOCUMENT> wrapper heuristics, inline-XBRL signals,
    and finally structural tag-count fallbacks.  The numbered comments
    (1, 2, 4, 15, ...) refer to the original signature catalogue and are
    intentionally not sequential in source order.
    """
    with open(filepath, "rb") as f:
        raw = f.read(READ_BYTES)
    # Filings may contain arbitrary bytes; never fail on decode.
    text = raw.decode("utf-8", errors="replace")
    text_lower = text.lower()
    # --- Explicit generator metadata ---
    # 1. <meta name="generator" content="..."> (both attribute orderings)
    m = re.search(
        r'<meta\s+name\s*=\s*["\']generator["\']\s+content\s*=\s*["\']([^"\']+)["\']',
        text, re.I,
    )
    if not m:
        # Some tools emit content= before name=; try the reversed ordering.
        m = re.search(
            r'<meta\s+content\s*=\s*["\']([^"\']+)["\']\s+name\s*=\s*["\']generator["\']',
            text, re.I,
        )
    if m:
        return _normalize_generator(m.group(1)), f'meta generator: {m.group(1)}'
    # 2. <meta name="Creator" content="...">
    m = re.search(
        r'<meta\s+name\s*=\s*["\']Creator["\']\s+content\s*=\s*["\']([^"\']+)["\']',
        text, re.I,
    )
    if m:
        return _normalize_generator(m.group(1)), f'meta Creator: {m.group(1)}'
    # 4. <meta name="Producer" content="...">
    m = re.search(
        r'<meta\s+name\s*=\s*["\']Producer["\']\s+content\s*=\s*["\']([^"\']+)["\']',
        text, re.I,
    )
    if m:
        return _normalize_generator(m.group(1)), f'meta Producer: {m.group(1)}'
    # 15. ProgId meta tag (Word, Excel converters)
    m = re.search(
        r'<meta\s+name\s*=\s*["\']ProgId["\']\s+content\s*=\s*["\']([^"\']+)["\']',
        text, re.I,
    )
    if m:
        progid = m.group(1)
        if "word" in progid.lower():
            return "Microsoft Word", f"ProgId: {progid}"
        if "excel" in progid.lower():
            return "Microsoft Excel", f"ProgId: {progid}"
        return _normalize_generator(progid), f"ProgId: {progid}"
    # --- HTML comment signatures (search full 20KB) ---
    # Workiva / Wdesk
    if re.search(r"<!--.*Created with the Workiva Platform.*-->", text, re.I):
        return "Workiva", "comment: Created with the Workiva Platform"
    if re.search(r"<!--.*Copyright\s+\d{4}\s+Workiva.*-->", text, re.I):
        return "Workiva", "comment: Copyright Workiva"
    if re.search(r"<!--.*Document created using Wdesk.*-->", text, re.I):
        return "Workiva", "comment: Document created using Wdesk"
    # Toppan Merrill / Bridge
    if re.search(r"<!--.*(?:Toppan\s*Merrill|iXBRL document created with.*Toppan).*-->", text, re.I):
        return "Toppan Merrill", "comment: Toppan Merrill"
    if re.search(r"<!--.*Merrill\s*Bridge.*-->", text, re.I):
        return "Toppan Merrill", "comment: Merrill Bridge"
    # Donnelley Financial Solutions / RR Donnelley
    if re.search(r"<!--.*Donnelley Financial Solutions.*-->", text, re.I):
        return "Donnelley Financial Solutions", "comment: Donnelley Financial Solutions"
    if re.search(r"<!--.*RR\s*Donnelley.*-->", text, re.I):
        return "Donnelley Financial Solutions", "comment: RR Donnelley"
    # Broadridge PROfile
    if re.search(r"<!--.*Broadridge\s+PROfile.*-->", text, re.I):
        return "Broadridge PROfile", "comment: Broadridge PROfile"
    # Also match "Licensed to: ... Document created using Broadridge PROfile"
    if "broadridge" in text_lower:
        return "Broadridge PROfile", "keyword: broadridge"
    # SEC Publisher (in title or comment)
    # title_text is also reused by the "PDF to EDGAR" and title checks below.
    m_title = re.search(r"<title[^>]*>([^<]+)</title>", text, re.I)
    title_text = m_title.group(1).strip() if m_title else ""
    if "sec publisher" in text_lower or "sec publisher" in title_text.lower():
        return "SEC Publisher", "title/keyword: SEC Publisher"
    # IRIS Carbon (various filing agents using IRIS Carbon platform)
    m = re.search(r"<!--.*Powered by IRIS Carbon.*-->", text, re.I)
    if m:
        # Extract the filing agent name before "Powered by IRIS Carbon"
        m2 = re.search(r"<!--\s*([^,]+),\s*Powered by IRIS Carbon", text, re.I)
        agent = m2.group(1).strip() if m2 else "Unknown agent"
        return "IRIS Carbon", f"comment: {agent} via IRIS Carbon"
    # Certent Disclosure Management
    if re.search(r"<!--.*Certent\s+Disclosure\s+Management.*-->", text, re.I):
        return "Certent", "comment: Certent Disclosure Management"
    if "certent" in text_lower:
        return "Certent", "keyword: certent"
    # CompSci Resources, LLC
    if re.search(r"<!--.*CompSci Resources.*-->", text, re.I):
        return "CompSci Transform", "comment: CompSci Resources"
    # RDG Portal
    if re.search(r"<!--.*RDG Portal.*-->", text, re.I):
        return "RDG Portal", "comment: RDG Portal"
    # PDF to EDGAR
    if title_text.lower() == "pdf to edgar" or "pdf to edgar" in text_lower[:2000]:
        return "PDF to EDGAR", "title/keyword: PDF to EDGAR"
    # Generic generated/created by comments (but NOT bare dates)
    m = re.search(r"<!--\s*Generated\s+by\s+([^-]+?)-->", text, re.I)
    if m:
        val = m.group(1).strip()
        # Skip comments that are merely timestamps, e.g. <!-- Generated by 01/02/21 -->
        if not re.match(r"^\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}", val):
            return _normalize_generator(val), f"comment: Generated by {val}"
    m = re.search(r"<!--\s*Created\s+(?:by|with)\s+([^-]+?)-->", text, re.I)
    if m:
        val = m.group(1).strip()
        if not re.match(r"^\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}", val):
            return _normalize_generator(val), f"comment: Created by/with {val}"
    # --- Keyword signatures in full text ---
    # 5. Workiva
    if re.search(r"\bwdesk\b", text_lower):
        return "Workiva", "keyword: wdesk"
    if re.search(r"\bworkiva\b", text_lower):
        return "Workiva", "keyword: workiva"
    # 6. Donnelley/DFIN
    if re.search(r"\brrdonnelley\b", text_lower):
        return "Donnelley Financial Solutions", "keyword: rrdonnelley"
    if re.search(r"\bedgar-online\b", text_lower):
        return "Donnelley Financial Solutions", "keyword: edgar-online"
    # 7. Toppan Merrill
    if re.search(r"\btoppan\b", text_lower):
        return "Toppan Merrill", "keyword: toppan"
    # "merrill" alone is ambiguous; require a bridge/xbrl co-occurrence.
    if re.search(r"\bmerrill\b", text_lower) and re.search(r"\b(?:bridge|ixbrl|xbrl)\b", text_lower):
        return "Toppan Merrill", "keyword: merrill + bridge/xbrl"
    if re.search(r"\bbowne\b", text_lower):
        return "Toppan Merrill", "keyword: bowne"
    # 8. CompSci Transform
    if re.search(r"\bcompsci\b", text_lower):
        return "CompSci Transform", "keyword: compsci"
    # 9. ThunderDome
    if re.search(r"\bthunderdome\b", text_lower):
        return "ThunderDome", "keyword: thunderdome"
    # 10. GoXBRL
    if re.search(r"\bgoxbrl\b", text_lower):
        return "GoXBRL", "keyword: goxbrl"
    # 16. CSS class naming patterns
    if re.search(r'class\s*=\s*["\'][^"\']*\bwk_\w+', text_lower):
        return "Workiva", "CSS class prefix: wk_"
    # --- SGML document wrapper detection ---
    # Every SGML-wrapped file returns from inside this branch.
    has_sgml = re.search(r"<DOCUMENT>\s*\n?\s*<TYPE>", text, re.I)
    if has_sgml:
        m_fn = re.search(r"<FILENAME>\s*([\w\-\.]+)", text, re.I)
        if m_fn:
            filename = m_fn.group(1).lower()
            # d + digits = Donnelley Financial Solutions
            if re.match(r"d\d+", filename):
                return "Donnelley Financial Solutions", f"SGML filename: {m_fn.group(1)}"
            # tm + digits = Toppan Merrill
            if re.match(r"tm\d+", filename):
                return "Toppan Merrill", f"SGML filename: {m_fn.group(1)}"
            # ea + digits = EFiling/EDGAR Agent
            if re.match(r"ea\d+", filename):
                return "EFiling/EDGAR Agent", f"SGML filename: {m_fn.group(1)}"
        # SGML-wrapped but no known filename pattern — check for other signals inside
        # Rule-Page comments = Broadridge/EFiling variant
        if "<!-- field: rule-page" in text_lower or "rule-page" in text_lower[:5000]:
            return "Broadridge PROfile", "SGML + Rule-Page field comments"
        # Field: Set comments with xdx = EFiling XDX tool
        if "field: set; name: xdx" in text_lower:
            return "EFiling XDX", "SGML + xdx Field:Set comments"
        # <!-- Field: Set --> or <!-- Field: Rule --> without xdx
        if "<!-- field:" in text_lower[:5000]:
            return "EFiling/EDGAR Agent", "SGML + Field comments"
        # Donnelley structural pattern: Center/DIV 8.5in
        if re.search(r'<Center><DIV STYLE="width:8\.5in"', text):
            return "Donnelley Financial Solutions", "SGML + Center/DIV 8.5in layout"
        # Check accession prefix for known filing agents
        basename = os.path.basename(filepath)
        accession_prefix = basename.split("-")[0]
        if accession_prefix in FILING_AGENT_CIKS:
            return FILING_AGENT_CIKS[accession_prefix], f"SGML + filing agent CIK {accession_prefix}"
        # Remaining SGML-wrapped: classify by structural patterns
        font_count = text_lower.count("<font")
        if font_count > 5:
            return "SGML-wrapped (legacy/font-based)", f"SGML + {font_count} <font> tags"
        return "SGML-wrapped (unknown)", "SGML wrapper, no specific generator"
    # --- Inline XBRL detection for non-SGML files ---
    has_ix_ns = "xmlns:ix=" in text_lower or "<ix:header" in text_lower
    # 12. Structural: Donnelley uppercase P STYLE + Center DIV 8.5in
    if re.search(r'<P STYLE="[^"]*font-family:Times New Roman"', text) and re.search(
        r'<Center><DIV STYLE="width:8\.5in"', text
    ):
        return "Donnelley Financial Solutions", "structural: uppercase P STYLE + Center DIV 8.5in"
    # 14. Title tag tool names
    if title_text:
        title_lower = title_text.lower()
        if "workiva" in title_lower or "wdesk" in title_lower:
            return "Workiva", f"title: {title_text}"
    if has_ix_ns:
        # 11. ix:header with tool info / Field comments
        if "field: set; name: xdx" in text_lower:
            return "EFiling XDX", "iXBRL + xdx Field:Set comments"
        if "<!-- field: rule" in text_lower:
            return "Broadridge PROfile", "iXBRL + Rule-Page field comments"
        if "<!-- field:" in text_lower[:5000]:
            return "EFiling/EDGAR Agent", "iXBRL + Field comments"
        # Filing agent CIK-based detection
        basename = os.path.basename(filepath)
        accession_prefix = basename.split("-")[0]
        if accession_prefix in FILING_AGENT_CIKS:
            agent = FILING_AGENT_CIKS[accession_prefix]
            return f"{agent}", f"iXBRL + filing agent CIK {accession_prefix}"
        # 13. XML declaration encoding as structural signal
        if '<?xml version="1.0" encoding="utf-8"' in text_lower[:200]:
            return "Inline XBRL (utf-8 toolchain)", "iXBRL + utf-8 XML declaration"
        if "<?xml version='1.0' encoding='ascii'?>" in text_lower[:200]:
            if re.search(r'<div style="display:none"><ix:header>', text_lower[:3000]):
                return "Inline XBRL (SEC/EDGAR standard)", "iXBRL + ASCII XML + hidden ix:header"
            return "Inline XBRL (SEC/EDGAR standard)", "iXBRL + ASCII XML declaration"
        # Generic inline XBRL with no other signal
        return "Inline XBRL (tool unresolved)", "iXBRL namespace only"
    # --- Structural fallbacks for non-XBRL files ---
    font_count = text_lower.count("<font")
    td_count = text_lower.count("<td")
    span_count = text_lower.count("<span")
    if font_count > 20:
        return "Legacy generator (font-based)", f"structural: {font_count} <font> tags"
    if td_count > 50 and span_count < 10:
        return "Table-based generator", f"structural: {td_count} <td> tags"
    data_attr_count = len(re.findall(r"\bdata-\w+", text_lower))
    if data_attr_count > 10:
        return "Modern web tooling", f"structural: {data_attr_count} data- attributes"
    return "Unknown", "no signature detected"
def _normalize_generator(raw: str) -> str:
"""Normalize generator names to canonical forms."""
r = raw.strip().lower()
if "workiva" in r or "wdesk" in r:
return "Workiva"
if "donnelley" in r or "dfin" in r or "rrdonnelley" in r:
return "Donnelley Financial Solutions"
if ("toppan" in r) or ("merrill" in r and "bridge" in r):
return "Toppan Merrill"
if "word" in r and "microsoft" in r:
return "Microsoft Word"
if "excel" in r and "microsoft" in r:
return "Microsoft Excel"
if "thunderdome" in r:
return "ThunderDome"
if "goxbrl" in r:
return "GoXBRL"
if "compsci" in r:
return "CompSci Transform"
if "certent" in r:
return "Certent"
if "iris carbon" in r:
return "IRIS Carbon"
if "broadridge" in r or "profile" in r:
return "Broadridge PROfile"
if "sec publisher" in r:
return "SEC Publisher"
return raw.strip()
def extract_body_snippet(filepath: str) -> str:
    """Return a whitespace-collapsed snippet (<= 200 chars) after the <body> tag.

    Falls back to the first 200 characters of the file when no <body>
    tag appears in the first READ_BYTES of the file.
    """
    with open(filepath, "rb") as handle:
        head = handle.read(READ_BYTES).decode("utf-8", errors="replace")
    match = re.search(r"<body[^>]*>(.*)", head, re.I | re.S)
    snippet = match.group(1)[:200].strip() if match else head[:200]
    # Collapse all runs of whitespace so the snippet fits on one line.
    return re.sub(r"\s+", " ", snippet)
def extract_class_names(filepath: str, max_elements: int = 10) -> list[str]:
    """Return up to ``max_elements`` class-attribute values from the file head.

    Only the first READ_BYTES of the file are scanned; each returned item
    is the raw value of one class="..." attribute (possibly multi-class).
    """
    with open(filepath, "rb") as handle:
        head = handle.read(READ_BYTES).decode("utf-8", errors="replace")
    found = re.findall(r'class\s*=\s*["\']([^"\']+)["\']', head, re.I)
    return found[:max_elements]
def main():
    """Detect generators for all filings in HTML_DIR and print a report.

    Phase 1 classifies every file via :func:`detect_generator`, Phase 2
    fingerprints and clusters the files that stayed "Unknown", and
    Phase 3 prints summary tables plus a consolidated tool-family view.

    Fixes vs. previous revision: bails out early when HTML_DIR has no
    *.html files (the percentage maths below divided by zero), and the
    unused GROUP_MAP dead code was removed.
    """
    files = sorted(HTML_DIR.glob("*.html"))
    total = len(files)
    print(f"Processing {total} HTML files...\n")
    # Guard: every percentage below divides by `total`.
    if total == 0:
        print("No HTML files found; nothing to do.")
        return
    results: dict[str, tuple[str, str]] = {}
    generator_examples: dict[str, list[str]] = defaultdict(list)
    generator_methods: dict[str, set[str]] = defaultdict(set)
    for i, fp in enumerate(files):
        accession = fp.stem
        gen, evidence = detect_generator(str(fp))
        results[accession] = (gen, evidence)
        generator_examples[gen].append(accession)
        # Evidence strings look like "method: detail"; keep the method part.
        method = evidence.split(":")[0].strip()
        generator_methods[gen].add(method)
        if (i + 1) % 2000 == 0:
            print(f" Processed {i + 1}/{total}...", file=sys.stderr)
    # --- Phase 1 output ---
    print("=" * 110)
    print("PHASE 1: Generator Detection Results")
    print("=" * 110)
    gen_counts = Counter(gen for gen, _ in results.values())
    for gen, count in gen_counts.most_common():
        pct = count / total * 100
        examples = generator_examples[gen][:3]
        methods = ", ".join(sorted(generator_methods[gen]))
        print(f"\n {gen}")
        print(f" Count: {count:,} ({pct:.1f}%)")
        print(f" Methods: {methods}")
        print(f" Examples: {', '.join(examples)}")
    # --- Phase 2: Cluster unknowns ---
    unknowns = [acc for acc, (gen, _) in results.items() if gen == "Unknown"]
    print(f"\n\n{'=' * 110}")
    print(f"PHASE 2: Clustering {len(unknowns)} Unknown Files")
    print("=" * 110)
    if unknowns:
        fingerprints: dict[str, list[str]] = defaultdict(list)
        for acc in unknowns:
            path = HTML_DIR / f"{acc}.html"
            with open(path, "rb") as f:
                raw_bytes = f.read(READ_BYTES)
            text = raw_bytes.decode("utf-8", errors="replace")
            text_lower = text.lower()
            # Structural fingerprint: XML decl, doctype, first tag,
            # dominant layout element, and leading CSS class prefix.
            has_xml_decl = text.startswith("<?xml")
            has_doctype = "<!doctype" in text_lower[:500]
            first_tag_m = re.search(r"<(\w+)", text)
            first_tag = first_tag_m.group(1).lower() if first_tag_m else ""
            td_c = text_lower.count("<td")
            span_c = text_lower.count("<span")
            div_c = text_lower.count("<div")
            p_c = text_lower.count("<p ")
            font_c = text_lower.count("<font")
            counts = {"td": td_c, "span": span_c, "div": div_c, "p": p_c, "font": font_c}
            dominant = max(counts, key=counts.get) if max(counts.values()) > 0 else "empty"
            classes = re.findall(r'class\s*=\s*["\']([^"\']+)["\']', text[:5000], re.I)
            class_prefix = ""
            if classes:
                fc = classes[0].split()[0]
                if "_" in fc:
                    class_prefix = fc.split("_")[0] + "_"
                elif "-" in fc:
                    class_prefix = fc.split("-")[0] + "-"
                else:
                    class_prefix = fc[:4]
            fingerprint = (
                f"xml={has_xml_decl}|doctype={has_doctype}|first={first_tag}"
                f"|layout={dominant}|cls={class_prefix}"
            )
            fingerprints[fingerprint].append(acc)
        # Largest clusters first.
        for idx, (fp_key, accs) in enumerate(
            sorted(fingerprints.items(), key=lambda x: -len(x[1]))
        ):
            print(f"\n Cluster {idx + 1} ({len(accs)} files): {fp_key}")
            for acc in accs[:5]:
                filepath = HTML_DIR / f"{acc}.html"
                snippet = extract_body_snippet(str(filepath))
                cls = extract_class_names(str(filepath), 5)
                print(f" {acc}:")
                print(f" Snippet: {snippet[:120]}")
                if cls:
                    print(f" Classes: {cls[:5]}")
            if len(accs) > 5:
                print(f" ... and {len(accs) - 5} more files")
    else:
        print(" No truly unknown files remain!")
    # --- Phase 3: Summary ---
    print(f"\n\n{'=' * 110}")
    print("PHASE 3: Summary Statistics")
    print("=" * 110)
    header = (
        f"\n{'Generator':<45} {'Count':>7} {'%':>7} "
        f"{'Detection Methods':<50} {'Examples (up to 3)'}"
    )
    print(header)
    print("-" * 170)
    for gen, count in gen_counts.most_common():
        pct = count / total * 100
        examples = ", ".join(generator_examples[gen][:3])
        methods = ", ".join(sorted(generator_methods[gen]))
        if len(methods) > 50:
            methods = methods[:47] + "..."
        print(f"{gen:<45} {count:>7} {pct:>6.1f}% {methods:<50} {examples}")
    print("-" * 170)
    print(f"{'TOTAL':<45} {total:>7} {100.0:>6.1f}%")
    unknown_count = gen_counts.get("Unknown", 0)
    identified = total - unknown_count
    print(f"\nIdentified: {identified:,} / {total:,} ({identified / total * 100:.1f}%)")
    print(f"Truly unidentified: {unknown_count:,} / {total:,} ({unknown_count / total * 100:.1f}%)")
    # Consolidated view: group by parent tool family
    print(f"\n\n{'=' * 110}")
    print("CONSOLIDATED VIEW (grouped by tool family)")
    print("=" * 110)
    # Maps each fine-grained generator label to its tool family; labels
    # absent from the map fall through unchanged via .get(gen, gen).
    family_map = {
        "Workiva": "Workiva",
        "Donnelley Financial Solutions": "Donnelley Financial Solutions",
        "Toppan Merrill": "Toppan Merrill",
        "CompSci Transform": "CompSci Transform",
        "ThunderDome": "ThunderDome",
        "EFiling/EDGAR Agent": "EFiling/EDGAR Agent",
        "EFiling XDX": "EFiling/EDGAR Agent",
        "Broadridge PROfile": "Broadridge PROfile",
        "SEC Publisher": "SEC Publisher",
        "IRIS Carbon": "IRIS Carbon",
        "RDG Portal": "RDG Portal",
        "Certent": "Certent",
        "PDF to EDGAR": "PDF to EDGAR",
        "GoXBRL": "GoXBRL",
        "Microsoft Word": "Microsoft Word",
        "Microsoft Excel": "Microsoft Excel",
        "Inline XBRL (SEC/EDGAR standard)": "Inline XBRL (unattributed)",
        "Inline XBRL (utf-8 toolchain)": "Inline XBRL (unattributed)",
        "Inline XBRL (tool unresolved)": "Inline XBRL (unattributed)",
        "SGML-wrapped (legacy/font-based)": "SGML-wrapped (unattributed)",
        "SGML-wrapped (unknown)": "SGML-wrapped (unattributed)",
        "Legacy generator (font-based)": "Other/Legacy",
        "Table-based generator": "Other/Legacy",
        "Modern web tooling": "Other/Legacy",
        "Unknown": "Unknown",
    }
    family_counts: Counter = Counter()
    family_examples: dict[str, list[str]] = defaultdict(list)
    for gen, count in gen_counts.items():
        family = family_map.get(gen, gen)
        family_counts[family] += count
        family_examples[family].extend(generator_examples[gen][:3])
    print(f"\n{'Tool Family':<45} {'Count':>7} {'%':>7}")
    print("-" * 65)
    for family, count in family_counts.most_common():
        pct = count / total * 100
        examples = ", ".join(family_examples[family][:3])
        print(f"{family:<45} {count:>7} {pct:>6.1f}% {examples}")
    print("-" * 65)
    print(f"{'TOTAL':<45} {total:>7} {100.0:>6.1f}%")
# Script entry point: run the full three-phase report when executed directly.
if __name__ == "__main__":
    main()