#!/usr/bin/env python3 """ Cross-reference SEC filing generators with paragraph quality metrics. Reuses detection logic from detect_generators.py, then computes quality metrics per generator from paragraphs-clean.jsonl. """ import json import os import re import sys import statistics from collections import defaultdict, Counter from pathlib import Path HTML_DIR = Path("/home/joey/Documents/sec-cyBERT/data/raw/html") PARAGRAPHS_FILE = Path("/home/joey/Documents/sec-cyBERT/data/paragraphs/paragraphs-clean.jsonl") READ_BYTES = 20_000 # ── Generator detection (copied from detect_generators.py) ── FILING_AGENT_CIKS = { "0000950170": "Donnelley Financial Solutions", "0001193125": "Donnelley Financial Solutions", "0001558370": "Toppan Merrill", "0001654954": "Toppan Merrill", } def _normalize_generator(raw: str) -> str: r = raw.strip().lower() if "workiva" in r or "wdesk" in r: return "Workiva" if "donnelley" in r or "dfin" in r or "rrdonnelley" in r: return "Donnelley Financial Solutions" if ("toppan" in r) or ("merrill" in r and "bridge" in r): return "Toppan Merrill" if "word" in r and "microsoft" in r: return "Microsoft Word" if "excel" in r and "microsoft" in r: return "Microsoft Excel" if "thunderdome" in r: return "ThunderDome" if "goxbrl" in r: return "GoXBRL" if "compsci" in r: return "CompSci Transform" if "certent" in r: return "Certent" if "iris carbon" in r: return "IRIS Carbon" if "broadridge" in r or "profile" in r: return "Broadridge PROfile" if "sec publisher" in r: return "SEC Publisher" return raw.strip() def detect_generator(filepath: str) -> str: """Read first 20KB and return generator name.""" with open(filepath, "rb") as f: raw = f.read(READ_BYTES) text = raw.decode("utf-8", errors="replace") text_lower = text.lower() # meta generator m = re.search(r'", text, re.I): return "Workiva" if re.search(r"", text, re.I): return "Workiva" if re.search(r"", text, re.I): return "Workiva" if re.search(r"", text, re.I): return "Toppan Merrill" if re.search(r"", text, re.I): return "Toppan Merrill" if re.search(r"", text, re.I): return "Donnelley Financial Solutions" if re.search(r"", text, re.I): return "Donnelley Financial Solutions" if re.search(r"", text, re.I): return "Broadridge PROfile" if "broadridge" in text_lower: return "Broadridge PROfile" m_title = re.search(r"]*>([^<]+)", text, re.I) title_text = m_title.group(1).strip() if m_title else "" if "sec publisher" in text_lower or "sec publisher" in title_text.lower(): return "SEC Publisher" m = re.search(r"", text, re.I) if m: return "IRIS Carbon" if re.search(r"", text, re.I): return "Certent" if "certent" in text_lower: return "Certent" if re.search(r"", text, re.I): return "CompSci Transform" if re.search(r"", text, re.I): return "RDG Portal" if title_text.lower() == "pdf to edgar" or "pdf to edgar" in text_lower[:2000]: return "PDF to EDGAR" m = re.search(r"", text, re.I) if m: val = m.group(1).strip() if not re.match(r"^\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}", val): return _normalize_generator(val) m = re.search(r"", text, re.I) if m: val = m.group(1).strip() if not re.match(r"^\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}", val): return _normalize_generator(val) # Keyword signatures if re.search(r"\bwdesk\b", text_lower): return "Workiva" if re.search(r"\bworkiva\b", text_lower): return "Workiva" if re.search(r"\brrdonnelley\b", text_lower): return "Donnelley Financial Solutions" if re.search(r"\bedgar-online\b", text_lower): return "Donnelley Financial Solutions" if re.search(r"\btoppan\b", text_lower): return "Toppan Merrill" if re.search(r"\bmerrill\b", text_lower) and re.search(r"\b(?:bridge|ixbrl|xbrl)\b", text_lower): return "Toppan Merrill" if re.search(r"\bbowne\b", text_lower): return "Toppan Merrill" if re.search(r"\bcompsci\b", text_lower): return "CompSci Transform" if re.search(r"\bthunderdome\b", text_lower): return "ThunderDome" if re.search(r"\bgoxbrl\b", text_lower): return "GoXBRL" if re.search(r'class\s*=\s*["\'][^"\']*\bwk_\w+', text_lower): return "Workiva" # SGML document wrapper has_sgml = re.search(r"\s*\n?\s*", text, re.I) if has_sgml: m_fn = re.search(r"\s*([\w\-\.]+)", text, re.I) if m_fn: filename = m_fn.group(1).lower() if re.match(r"d\d+", filename): return "Donnelley Financial Solutions" if re.match(r"tm\d+", filename): return "Toppan Merrill" if re.match(r"ea\d+", filename): return "EFiling/EDGAR Agent" if "