#!/usr/bin/env python3
"""
Detect HTML generators for all SEC filing HTML files.
Phase 1: Exhaustive signature detection
Phase 2: Cluster remaining unknowns
Phase 3: Summary statistics
"""
import os
import re
import sys
from collections import defaultdict, Counter
from pathlib import Path
# Directory holding the raw filing HTML (presumably the tree scanned by this
# script -- the code that walks it is outside this section; confirm there).
HTML_DIR = Path("/home/joey/Documents/sec-cyBERT/data/raw/html")

# Upper bound on the number of leading bytes detect_generator() reads from
# each file before running its signature checks.
READ_BYTES = 20_000

# Known SEC filing agent CIKs (accession number prefixes), keyed by the
# 10-digit prefix and mapped to the filing agent's display name. Prefixes are
# grouped per agent; insertion order matches the literal listing
# (both Donnelley prefixes first, then both Toppan Merrill prefixes).
FILING_AGENT_CIKS = {
    **dict.fromkeys(
        ("0000950170", "0001193125"),
        "Donnelley Financial Solutions",
    ),
    **dict.fromkeys(
        ("0001558370", "0001654954"),
        "Toppan Merrill",
    ),
}
def detect_generator(filepath: str) -> tuple[str, str]:
"""Read first 20KB of file and detect generator. Returns (generator, evidence)."""
with open(filepath, "rb") as f:
raw = f.read(READ_BYTES)
text = raw.decode("utf-8", errors="replace")
text_lower = text.lower()
# --- Explicit generator metadata ---
# 1. (both attribute orderings)
m = re.search(
r'
m = re.search(
r'
m = re.search(
r'", text, re.I):
return "Workiva", "comment: Created with the Workiva Platform"
if re.search(r"", text, re.I):
return "Workiva", "comment: Copyright Workiva"
if re.search(r"", text, re.I):
return "Workiva", "comment: Document created using Wdesk"
# Toppan Merrill / Bridge
if re.search(r"", text, re.I):
return "Toppan Merrill", "comment: Toppan Merrill"
if re.search(r"", text, re.I):
return "Toppan Merrill", "comment: Merrill Bridge"
# Donnelley Financial Solutions / RR Donnelley
if re.search(r"", text, re.I):
return "Donnelley Financial Solutions", "comment: Donnelley Financial Solutions"
if re.search(r"", text, re.I):
return "Donnelley Financial Solutions", "comment: RR Donnelley"
# Broadridge PROfile
if re.search(r"", text, re.I):
return "Broadridge PROfile", "comment: Broadridge PROfile"
# Also match "Licensed to: ... Document created using Broadridge PROfile"
if "broadridge" in text_lower:
return "Broadridge PROfile", "keyword: broadridge"
# SEC Publisher (in title or comment)
m_title = re.search(r"
]*>([^<]+)", text, re.I)
title_text = m_title.group(1).strip() if m_title else ""
if "sec publisher" in text_lower or "sec publisher" in title_text.lower():
return "SEC Publisher", "title/keyword: SEC Publisher"
# IRIS Carbon (various filing agents using IRIS Carbon platform)
m = re.search(r"", text, re.I)
if m:
# Extract the filing agent name before "Powered by IRIS Carbon"
m2 = re.search(r"", text, re.I):
return "Certent", "comment: Certent Disclosure Management"
if "certent" in text_lower:
return "Certent", "keyword: certent"
# CompSci Resources, LLC
if re.search(r"", text, re.I):
return "CompSci Transform", "comment: CompSci Resources"
# RDG Portal
if re.search(r"", text, re.I):
return "RDG Portal", "comment: RDG Portal"
# PDF to EDGAR
if title_text.lower() == "pdf to edgar" or "pdf to edgar" in text_lower[:2000]:
return "PDF to EDGAR", "title/keyword: PDF to EDGAR"
# Generic generated/created by comments (but NOT bare dates)
m = re.search(r"", text, re.I)
if m:
val = m.group(1).strip()
if not re.match(r"^\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}", val):
return _normalize_generator(val), f"comment: Generated by {val}"
m = re.search(r"", text, re.I)
if m:
val = m.group(1).strip()
if not re.match(r"^\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}", val):
return _normalize_generator(val), f"comment: Created by/with {val}"
# --- Keyword signatures in full text ---
# 5. Workiva
if re.search(r"\bwdesk\b", text_lower):
return "Workiva", "keyword: wdesk"
if re.search(r"\bworkiva\b", text_lower):
return "Workiva", "keyword: workiva"
# 6. Donnelley/DFIN
if re.search(r"\brrdonnelley\b", text_lower):
return "Donnelley Financial Solutions", "keyword: rrdonnelley"
if re.search(r"\bedgar-online\b", text_lower):
return "Donnelley Financial Solutions", "keyword: edgar-online"
# 7. Toppan Merrill
if re.search(r"\btoppan\b", text_lower):
return "Toppan Merrill", "keyword: toppan"
if re.search(r"\bmerrill\b", text_lower) and re.search(r"\b(?:bridge|ixbrl|xbrl)\b", text_lower):
return "Toppan Merrill", "keyword: merrill + bridge/xbrl"
if re.search(r"\bbowne\b", text_lower):
return "Toppan Merrill", "keyword: bowne"
# 8. CompSci Transform
if re.search(r"\bcompsci\b", text_lower):
return "CompSci Transform", "keyword: compsci"
# 9. ThunderDome
if re.search(r"\bthunderdome\b", text_lower):
return "ThunderDome", "keyword: thunderdome"
# 10. GoXBRL
if re.search(r"\bgoxbrl\b", text_lower):
return "GoXBRL", "keyword: goxbrl"
# 16. CSS class naming patterns
if re.search(r'class\s*=\s*["\'][^"\']*\bwk_\w+', text_lower):
return "Workiva", "CSS class prefix: wk_"
# --- SGML document wrapper detection ---
has_sgml = re.search(r"\s*\n?\s*", text, re.I)
if has_sgml:
m_fn = re.search(r"\s*([\w\-\.]+)", text, re.I)
if m_fn:
filename = m_fn.group(1).lower()
# d + digits = Donnelley Financial Solutions
if re.match(r"d\d+", filename):
return "Donnelley Financial Solutions", f"SGML filename: {m_fn.group(1)}"
# tm + digits = Toppan Merrill
if re.match(r"tm\d+", filename):
return "Toppan Merrill", f"SGML filename: {m_fn.group(1)}"
# ea + digits = EFiling/EDGAR Agent
if re.match(r"ea\d+", filename):
return "EFiling/EDGAR Agent", f"SGML filename: {m_fn.group(1)}"
# SGML-wrapped but no known filename pattern — check for other signals inside
# Rule-Page comments = Broadridge/EFiling variant
if " or without xdx
if "