#!/usr/bin/env python3 """ Detect HTML generators for all SEC filing HTML files. Phase 1: Exhaustive signature detection Phase 2: Cluster remaining unknowns Phase 3: Summary statistics """ import os import re import sys from collections import defaultdict, Counter from pathlib import Path HTML_DIR = Path("/home/joey/Documents/sec-cyBERT/data/raw/html") READ_BYTES = 20_000 # Known SEC filing agent CIKs (accession number prefixes) FILING_AGENT_CIKS = { "0000950170": "Donnelley Financial Solutions", "0001193125": "Donnelley Financial Solutions", "0001558370": "Toppan Merrill", "0001654954": "Toppan Merrill", } def detect_generator(filepath: str) -> tuple[str, str]: """Read first 20KB of file and detect generator. Returns (generator, evidence).""" with open(filepath, "rb") as f: raw = f.read(READ_BYTES) text = raw.decode("utf-8", errors="replace") text_lower = text.lower() # --- Explicit generator metadata --- # 1. (both attribute orderings) m = re.search( r' m = re.search( r' m = re.search( r'", text, re.I): return "Workiva", "comment: Created with the Workiva Platform" if re.search(r"", text, re.I): return "Workiva", "comment: Copyright Workiva" if re.search(r"", text, re.I): return "Workiva", "comment: Document created using Wdesk" # Toppan Merrill / Bridge if re.search(r"", text, re.I): return "Toppan Merrill", "comment: Toppan Merrill" if re.search(r"", text, re.I): return "Toppan Merrill", "comment: Merrill Bridge" # Donnelley Financial Solutions / RR Donnelley if re.search(r"", text, re.I): return "Donnelley Financial Solutions", "comment: Donnelley Financial Solutions" if re.search(r"", text, re.I): return "Donnelley Financial Solutions", "comment: RR Donnelley" # Broadridge PROfile if re.search(r"", text, re.I): return "Broadridge PROfile", "comment: Broadridge PROfile" # Also match "Licensed to: ... Document created using Broadridge PROfile" if "broadridge" in text_lower: return "Broadridge PROfile", "keyword: broadridge" # SEC Publisher (in title or comment) m_title = re.search(r"]*>([^<]+)", text, re.I) title_text = m_title.group(1).strip() if m_title else "" if "sec publisher" in text_lower or "sec publisher" in title_text.lower(): return "SEC Publisher", "title/keyword: SEC Publisher" # IRIS Carbon (various filing agents using IRIS Carbon platform) m = re.search(r"", text, re.I) if m: # Extract the filing agent name before "Powered by IRIS Carbon" m2 = re.search(r"", text, re.I): return "Certent", "comment: Certent Disclosure Management" if "certent" in text_lower: return "Certent", "keyword: certent" # CompSci Resources, LLC if re.search(r"", text, re.I): return "CompSci Transform", "comment: CompSci Resources" # RDG Portal if re.search(r"", text, re.I): return "RDG Portal", "comment: RDG Portal" # PDF to EDGAR if title_text.lower() == "pdf to edgar" or "pdf to edgar" in text_lower[:2000]: return "PDF to EDGAR", "title/keyword: PDF to EDGAR" # Generic generated/created by comments (but NOT bare dates) m = re.search(r"", text, re.I) if m: val = m.group(1).strip() if not re.match(r"^\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}", val): return _normalize_generator(val), f"comment: Generated by {val}" m = re.search(r"", text, re.I) if m: val = m.group(1).strip() if not re.match(r"^\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}", val): return _normalize_generator(val), f"comment: Created by/with {val}" # --- Keyword signatures in full text --- # 5. Workiva if re.search(r"\bwdesk\b", text_lower): return "Workiva", "keyword: wdesk" if re.search(r"\bworkiva\b", text_lower): return "Workiva", "keyword: workiva" # 6. Donnelley/DFIN if re.search(r"\brrdonnelley\b", text_lower): return "Donnelley Financial Solutions", "keyword: rrdonnelley" if re.search(r"\bedgar-online\b", text_lower): return "Donnelley Financial Solutions", "keyword: edgar-online" # 7. Toppan Merrill if re.search(r"\btoppan\b", text_lower): return "Toppan Merrill", "keyword: toppan" if re.search(r"\bmerrill\b", text_lower) and re.search(r"\b(?:bridge|ixbrl|xbrl)\b", text_lower): return "Toppan Merrill", "keyword: merrill + bridge/xbrl" if re.search(r"\bbowne\b", text_lower): return "Toppan Merrill", "keyword: bowne" # 8. CompSci Transform if re.search(r"\bcompsci\b", text_lower): return "CompSci Transform", "keyword: compsci" # 9. ThunderDome if re.search(r"\bthunderdome\b", text_lower): return "ThunderDome", "keyword: thunderdome" # 10. GoXBRL if re.search(r"\bgoxbrl\b", text_lower): return "GoXBRL", "keyword: goxbrl" # 16. CSS class naming patterns if re.search(r'class\s*=\s*["\'][^"\']*\bwk_\w+', text_lower): return "Workiva", "CSS class prefix: wk_" # --- SGML document wrapper detection --- has_sgml = re.search(r"\s*\n?\s*", text, re.I) if has_sgml: m_fn = re.search(r"\s*([\w\-\.]+)", text, re.I) if m_fn: filename = m_fn.group(1).lower() # d + digits = Donnelley Financial Solutions if re.match(r"d\d+", filename): return "Donnelley Financial Solutions", f"SGML filename: {m_fn.group(1)}" # tm + digits = Toppan Merrill if re.match(r"tm\d+", filename): return "Toppan Merrill", f"SGML filename: {m_fn.group(1)}" # ea + digits = EFiling/EDGAR Agent if re.match(r"ea\d+", filename): return "EFiling/EDGAR Agent", f"SGML filename: {m_fn.group(1)}" # SGML-wrapped but no known filename pattern — check for other signals inside # Rule-Page comments = Broadridge/EFiling variant if " or without xdx if "