#!/usr/bin/env python3 """ Cross-reference SEC filing generators with paragraph quality metrics. Reuses detection logic from detect_generators.py, then computes quality metrics per generator from paragraphs-clean.jsonl. """ import json import os import re import sys import statistics from collections import defaultdict, Counter from pathlib import Path HTML_DIR = Path("/home/joey/Documents/sec-cyBERT/data/raw/html") PARAGRAPHS_FILE = Path("/home/joey/Documents/sec-cyBERT/data/paragraphs/paragraphs-clean.jsonl") READ_BYTES = 20_000 # ── Generator detection (copied from detect_generators.py) ── FILING_AGENT_CIKS = { "0000950170": "Donnelley Financial Solutions", "0001193125": "Donnelley Financial Solutions", "0001558370": "Toppan Merrill", "0001654954": "Toppan Merrill", } def _normalize_generator(raw: str) -> str: r = raw.strip().lower() if "workiva" in r or "wdesk" in r: return "Workiva" if "donnelley" in r or "dfin" in r or "rrdonnelley" in r: return "Donnelley Financial Solutions" if ("toppan" in r) or ("merrill" in r and "bridge" in r): return "Toppan Merrill" if "word" in r and "microsoft" in r: return "Microsoft Word" if "excel" in r and "microsoft" in r: return "Microsoft Excel" if "thunderdome" in r: return "ThunderDome" if "goxbrl" in r: return "GoXBRL" if "compsci" in r: return "CompSci Transform" if "certent" in r: return "Certent" if "iris carbon" in r: return "IRIS Carbon" if "broadridge" in r or "profile" in r: return "Broadridge PROfile" if "sec publisher" in r: return "SEC Publisher" return raw.strip() def detect_generator(filepath: str) -> str: """Read first 20KB and return generator name.""" with open(filepath, "rb") as f: raw = f.read(READ_BYTES) text = raw.decode("utf-8", errors="replace") text_lower = text.lower() # meta generator m = re.search(r'", text, re.I): return "Workiva" if re.search(r"", text, re.I): return "Workiva" if re.search(r"", text, re.I): return "Workiva" if re.search(r"", text, re.I): return "Toppan Merrill" if re.search(r"", text, re.I): return "Toppan Merrill" if re.search(r"", text, re.I): return "Donnelley Financial Solutions" if re.search(r"", text, re.I): return "Donnelley Financial Solutions" if re.search(r"", text, re.I): return "Broadridge PROfile" if "broadridge" in text_lower: return "Broadridge PROfile" m_title = re.search(r"