SEC-cyBERT/scripts/generator_quality_analysis.py

#!/usr/bin/env python3
"""
Cross-reference SEC filing generators with paragraph quality metrics.
Reuses detection logic from detect_generators.py, then computes quality
metrics per generator from paragraphs-clean.jsonl.
"""

import json
import os
import re
import sys
import statistics
from collections import defaultdict, Counter
from pathlib import Path

HTML_DIR = Path("/home/joey/Documents/sec-cyBERT/data/raw/html")
PARAGRAPHS_FILE = Path("/home/joey/Documents/sec-cyBERT/data/paragraphs/paragraphs-clean.jsonl")
READ_BYTES = 20_000

# ── Generator detection (copied from detect_generators.py) ──

FILING_AGENT_CIKS = {
    "0000950170": "Donnelley Financial Solutions",
    "0001193125": "Donnelley Financial Solutions",
    "0001558370": "Toppan Merrill",
    "0001654954": "Toppan Merrill",
}


def _normalize_generator(raw: str) -> str:
    r = raw.strip().lower()
    if "workiva" in r or "wdesk" in r:
        return "Workiva"
    if "donnelley" in r or "dfin" in r or "rrdonnelley" in r:
        return "Donnelley Financial Solutions"
    if ("toppan" in r) or ("merrill" in r and "bridge" in r):
        return "Toppan Merrill"
    if "word" in r and "microsoft" in r:
        return "Microsoft Word"
    if "excel" in r and "microsoft" in r:
        return "Microsoft Excel"
    if "thunderdome" in r:
        return "ThunderDome"
    if "goxbrl" in r:
        return "GoXBRL"
    if "compsci" in r:
        return "CompSci Transform"
    if "certent" in r:
        return "Certent"
    if "iris carbon" in r:
        return "IRIS Carbon"
    if "broadridge" in r or "profile" in r:
        return "Broadridge PROfile"
    if "sec publisher" in r:
        return "SEC Publisher"
    return raw.strip()


def detect_generator(filepath: str) -> str:
    """Read first 20KB and return generator name."""
    with open(filepath, "rb") as f:
        raw = f.read(READ_BYTES)
    text = raw.decode("utf-8", errors="replace")
    text_lower = text.lower()

    # meta generator
    m = re.search(r'<meta\s+name\s*=\s*["\']generator["\']\s+content\s*=\s*["\']([^"\']+)["\']', text, re.I)
    if not m:
        m = re.search(r'<meta\s+content\s*=\s*["\']([^"\']+)["\']\s+name\s*=\s*["\']generator["\']', text, re.I)
    if m:
        return _normalize_generator(m.group(1))

    m = re.search(r'<meta\s+name\s*=\s*["\']Creator["\']\s+content\s*=\s*["\']([^"\']+)["\']', text, re.I)
    if m:
        return _normalize_generator(m.group(1))

    m = re.search(r'<meta\s+name\s*=\s*["\']Producer["\']\s+content\s*=\s*["\']([^"\']+)["\']', text, re.I)
    if m:
        return _normalize_generator(m.group(1))

    m = re.search(r'<meta\s+name\s*=\s*["\']ProgId["\']\s+content\s*=\s*["\']([^"\']+)["\']', text, re.I)
    if m:
        progid = m.group(1)
        if "word" in progid.lower():
            return "Microsoft Word"
        if "excel" in progid.lower():
            return "Microsoft Excel"
        return _normalize_generator(progid)

    # Comment signatures
    if re.search(r"<!--.*Created with the Workiva Platform.*-->", text, re.I):
        return "Workiva"
    if re.search(r"<!--.*Copyright\s+\d{4}\s+Workiva.*-->", text, re.I):
        return "Workiva"
    if re.search(r"<!--.*Document created using Wdesk.*-->", text, re.I):
        return "Workiva"

    if re.search(r"<!--.*(?:Toppan\s*Merrill|iXBRL document created with.*Toppan).*-->", text, re.I):
        return "Toppan Merrill"
    if re.search(r"<!--.*Merrill\s*Bridge.*-->", text, re.I):
        return "Toppan Merrill"

    if re.search(r"<!--.*Donnelley Financial Solutions.*-->", text, re.I):
        return "Donnelley Financial Solutions"
    if re.search(r"<!--.*RR\s*Donnelley.*-->", text, re.I):
        return "Donnelley Financial Solutions"

    if re.search(r"<!--.*Broadridge\s+PROfile.*-->", text, re.I):
        return "Broadridge PROfile"
    if "broadridge" in text_lower:
        return "Broadridge PROfile"

    m_title = re.search(r"<title[^>]*>([^<]+)</title>", text, re.I)
    title_text = m_title.group(1).strip() if m_title else ""
    if "sec publisher" in text_lower or "sec publisher" in title_text.lower():
        return "SEC Publisher"

    m = re.search(r"<!--.*Powered by IRIS Carbon.*-->", text, re.I)
    if m:
        return "IRIS Carbon"

    if re.search(r"<!--.*Certent\s+Disclosure\s+Management.*-->", text, re.I):
        return "Certent"
    if "certent" in text_lower:
        return "Certent"

    if re.search(r"<!--.*CompSci Resources.*-->", text, re.I):
        return "CompSci Transform"

    if re.search(r"<!--.*RDG Portal.*-->", text, re.I):
        return "RDG Portal"

    if title_text.lower() == "pdf to edgar" or "pdf to edgar" in text_lower[:2000]:
        return "PDF to EDGAR"

    m = re.search(r"<!--\s*Generated\s+by\s+([^-]+?)-->", text, re.I)
    if m:
        val = m.group(1).strip()
        if not re.match(r"^\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}", val):
            return _normalize_generator(val)
    m = re.search(r"<!--\s*Created\s+(?:by|with)\s+([^-]+?)-->", text, re.I)
    if m:
        val = m.group(1).strip()
        if not re.match(r"^\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}", val):
            return _normalize_generator(val)

    # Keyword signatures
    if re.search(r"\bwdesk\b", text_lower):
        return "Workiva"
    if re.search(r"\bworkiva\b", text_lower):
        return "Workiva"
    if re.search(r"\brrdonnelley\b", text_lower):
        return "Donnelley Financial Solutions"
    if re.search(r"\bedgar-online\b", text_lower):
        return "Donnelley Financial Solutions"
    if re.search(r"\btoppan\b", text_lower):
        return "Toppan Merrill"
    if re.search(r"\bmerrill\b", text_lower) and re.search(r"\b(?:bridge|ixbrl|xbrl)\b", text_lower):
        return "Toppan Merrill"
    if re.search(r"\bbowne\b", text_lower):
        return "Toppan Merrill"
    if re.search(r"\bcompsci\b", text_lower):
        return "CompSci Transform"
    if re.search(r"\bthunderdome\b", text_lower):
        return "ThunderDome"
    if re.search(r"\bgoxbrl\b", text_lower):
        return "GoXBRL"

    if re.search(r'class\s*=\s*["\'][^"\']*\bwk_\w+', text_lower):
        return "Workiva"

    # SGML document wrapper
    has_sgml = re.search(r"<DOCUMENT>\s*\n?\s*<TYPE>", text, re.I)
    if has_sgml:
        m_fn = re.search(r"<FILENAME>\s*([\w\-\.]+)", text, re.I)
        if m_fn:
            filename = m_fn.group(1).lower()
            if re.match(r"d\d+", filename):
                return "Donnelley Financial Solutions"
            if re.match(r"tm\d+", filename):
                return "Toppan Merrill"
            if re.match(r"ea\d+", filename):
                return "EFiling/EDGAR Agent"
        if "<!-- field: rule-page" in text_lower or "rule-page" in text_lower[:5000]:
            return "Broadridge PROfile"
        if "field: set; name: xdx" in text_lower:
            return "EFiling XDX"
        if "<!-- field:" in text_lower[:5000]:
            return "EFiling/EDGAR Agent"
        if re.search(r'<Center><DIV STYLE="width:8\.5in"', text):
            return "Donnelley Financial Solutions"
        basename = os.path.basename(filepath)
        accession_prefix = basename.split("-")[0]
        if accession_prefix in FILING_AGENT_CIKS:
            return FILING_AGENT_CIKS[accession_prefix]
        font_count = text_lower.count("<font")
        if font_count > 5:
            return "SGML-wrapped (legacy)"
        return "SGML-wrapped (unknown)"

    # Inline XBRL
    has_ix_ns = "xmlns:ix=" in text_lower or "<ix:header" in text_lower
    if re.search(r'<P STYLE="[^"]*font-family:Times New Roman"', text) and re.search(
        r'<Center><DIV STYLE="width:8\.5in"', text
    ):
        return "Donnelley Financial Solutions"
    if title_text:
        title_lower = title_text.lower()
        if "workiva" in title_lower or "wdesk" in title_lower:
            return "Workiva"

    if has_ix_ns:
        if "field: set; name: xdx" in text_lower:
            return "EFiling XDX"
        if "<!-- field: rule" in text_lower:
            return "Broadridge PROfile"
        if "<!-- field:" in text_lower[:5000]:
            return "EFiling/EDGAR Agent"
        basename = os.path.basename(filepath)
        accession_prefix = basename.split("-")[0]
        if accession_prefix in FILING_AGENT_CIKS:
            return FILING_AGENT_CIKS[accession_prefix]
        if '<?xml version="1.0" encoding="utf-8"' in text_lower[:200]:
            return "Inline XBRL (utf-8 toolchain)"
        if "<?xml version='1.0' encoding='ascii'?>" in text_lower[:200]:
            return "Inline XBRL (SEC/EDGAR standard)"
        return "Inline XBRL (tool unresolved)"

    # Structural fallbacks
    font_count = text_lower.count("<font")
    td_count = text_lower.count("<td")
    span_count = text_lower.count("<span")

    if font_count > 20:
        return "Legacy generator (font-based)"
    if td_count > 50 and span_count < 10:
        return "Table-based generator"
    data_attr_count = len(re.findall(r"\bdata-\w+", text_lower))
    if data_attr_count > 10:
        return "Modern web tooling"
    return "Unknown"


# ── Consolidate to ~14 families ──

FAMILY_MAP = {
    "Workiva": "Workiva",
    "Donnelley Financial Solutions": "Donnelley Financial Solutions",
    "Toppan Merrill": "Toppan Merrill",
    "CompSci Transform": "CompSci Transform",
    "ThunderDome": "ThunderDome",
    "EFiling/EDGAR Agent": "EFiling/EDGAR Agent",
    "EFiling XDX": "EFiling/EDGAR Agent",
    "Broadridge PROfile": "Broadridge PROfile",
    "SEC Publisher": "SEC Publisher",
    "IRIS Carbon": "IRIS Carbon",
    "RDG Portal": "RDG Portal",
    "Certent": "Certent",
    "PDF to EDGAR": "PDF to EDGAR",
    "GoXBRL": "GoXBRL",
    "Microsoft Word": "Microsoft Word",
    "Microsoft Excel": "Microsoft Excel",
    "Inline XBRL (SEC/EDGAR standard)": "Inline XBRL (unattributed)",
    "Inline XBRL (utf-8 toolchain)": "Inline XBRL (unattributed)",
    "Inline XBRL (tool unresolved)": "Inline XBRL (unattributed)",
    "SGML-wrapped (legacy)": "SGML-wrapped (unattributed)",
    "SGML-wrapped (unknown)": "SGML-wrapped (unattributed)",
    "Legacy generator (font-based)": "Other/Legacy",
    "Table-based generator": "Other/Legacy",
    "Modern web tooling": "Other/Legacy",
    "Unknown": "Unknown",
}


# ── Quality metric helpers ──

# Common non-heading start words to exclude from title-case detection
NON_HEADING_STARTS = {
    "we", "our", "the", "in", "a", "an", "as", "to", "on", "at", "by",
    "for", "it", "is", "if", "or", "no", "so", "do", "its", "this",
    "that", "with", "from", "has", "had", "have", "will", "may", "can",
    "all", "any", "are", "was", "were", "been", "not", "but", "each",
    "such", "these", "those", "also", "when", "there", "their",
    "they", "them", "than", "who", "what", "how", "where",
}

# Section name fragments for Item 1C
SECTION_KEYWORDS = [
    "risk management", "board oversight", "governance", "incident",
    "strategy", "third party", "management role", "cybersecurity",
    "risk factors", "material", "overview",
]

RE_ALLCAPS_HEADER = re.compile(r"^[A-Z][A-Z\s,&\-]{10,}[a-z]")

def is_inlined_header(text: str) -> bool:
    """Check if paragraph starts with an inlined heading pattern."""
    # ALL-CAPS header followed by body text
    if RE_ALLCAPS_HEADER.match(text):
        return True

    # Title-case heading: 2+ consecutive capitalized words at start (not common sentence starters)
    words = text.split()
    if len(words) < 4:
        return False
    cap_count = 0
    for w in words:
        clean = w.strip(".,;:!?()\"'")
        if not clean:
            continue
        if clean[0].isupper() and clean.lower() not in NON_HEADING_STARTS:
            cap_count += 1
        else:
            break
    if cap_count >= 2:
        # Check rest of text continues as a sentence (not just a short title)
        remaining = " ".join(words[cap_count:])
        if len(remaining) > 20:
            return True

    # Section keyword match at start
    text_lower = text[:80].lower()
    for kw in SECTION_KEYWORDS:
        if text_lower.startswith(kw):
            # Must have more text after the heading
            if len(text) > len(kw) + 10:
                return True

    return False


def is_orphan_word(text: str) -> bool:
    """Check if paragraph starts with lowercase (excluding list patterns)."""
    if not text:
        return False
    first_char = text[0]
    if not first_char.islower():
        return False
    # Exclude list pattern starters
    list_starters = ["and ", "or ", "including ", "i.e.", "e.g."]
    text_lower = text[:15].lower()
    for starter in list_starters:
        if text_lower.startswith(starter):
            return False
    # Exclude bullet-like patterns
    if text[0] in "•·-–—":
        return False
    return True


RE_TERMINAL = re.compile(r'[.!?;")]\s*$')

def is_truncated(text: str) -> bool:
    """Paragraph NOT ending with terminal punctuation."""
    return not RE_TERMINAL.search(text)


def is_fragment(text: str) -> bool:
    return len(text.split()) < 25


def main():
    # ── Step 1: Detect generators for all HTML files ──
    print("Step 1: Detecting generators for all HTML files...", file=sys.stderr)
    accession_to_generator = {}
    files = sorted(HTML_DIR.glob("*.html"))
    for i, fp in enumerate(files):
        accession = fp.stem
        gen_raw = detect_generator(str(fp))
        gen_family = FAMILY_MAP.get(gen_raw, gen_raw)
        accession_to_generator[accession] = gen_family
        if (i + 1) % 3000 == 0:
            print(f"  {i+1}/{len(files)} files processed...", file=sys.stderr)
    print(f"  Done: {len(files)} files, {len(set(accession_to_generator.values()))} generator families", file=sys.stderr)

    # ── Step 2: Load paragraphs and compute per-filing stats ──
    print("Step 2: Loading paragraphs...", file=sys.stderr)

    # Per-filing data
    filing_paragraphs = defaultdict(list)  # accession -> list of paragraph dicts
    text_hash_counts = Counter()  # textHash -> count of filings containing it

    # First pass: collect all textHashes and their filing counts
    text_hash_filings = defaultdict(set)  # textHash -> set of accessions
    all_paragraphs = []

    with open(PARAGRAPHS_FILE) as f:
        for line in f:
            p = json.loads(line)
            acc = p["filing"]["accessionNumber"]
            all_paragraphs.append(p)
            filing_paragraphs[acc].append(p)
            text_hash_filings[p["textHash"]].add(acc)

    print(f"  {len(all_paragraphs)} paragraphs across {len(filing_paragraphs)} filings", file=sys.stderr)

    # Boilerplate: textHash appearing in 3+ filings
    boilerplate_hashes = {h for h, accs in text_hash_filings.items() if len(accs) >= 3}
    print(f"  {len(boilerplate_hashes)} boilerplate hashes (in 3+ filings)", file=sys.stderr)

    # ── Step 3: Compute metrics per generator ──
    print("Step 3: Computing metrics...", file=sys.stderr)

    # Per-generator aggregate
    gen_stats = defaultdict(lambda: {
        "total_paragraphs": 0,
        "total_filings": 0,
        "paragraphs_per_filing": [],
        "word_counts": [],
        "inlined_header": 0,
        "orphan_word": 0,
        "fragment": 0,
        "truncated": 0,
        "boilerplate": 0,
    })

    # Per-filing issue rates for "most problematic" analysis
    filing_issue_rates = {}  # accession -> {metrics..., combined_rate}

    # Filings not in HTML dir (no generator detected)
    missing_gen = 0

    for acc, paragraphs in filing_paragraphs.items():
        gen = accession_to_generator.get(acc)
        if gen is None:
            missing_gen += 1
            gen = "(no HTML file)"

        stats = gen_stats[gen]
        stats["total_filings"] += 1
        stats["total_paragraphs"] += len(paragraphs)
        stats["paragraphs_per_filing"].append(len(paragraphs))

        # Per-filing counters for issue rate
        f_inlined = 0
        f_orphan = 0
        f_fragment = 0
        f_truncated = 0
        f_boilerplate = 0

        for p in paragraphs:
            text = p["text"]
            wc = p.get("wordCount", len(text.split()))
            stats["word_counts"].append(wc)

            if is_inlined_header(text):
                stats["inlined_header"] += 1
                f_inlined += 1
            if is_orphan_word(text):
                stats["orphan_word"] += 1
                f_orphan += 1
            if is_fragment(text):
                stats["fragment"] += 1
                f_fragment += 1
            if is_truncated(text):
                stats["truncated"] += 1
                f_truncated += 1
            if p["textHash"] in boilerplate_hashes:
                stats["boilerplate"] += 1
                f_boilerplate += 1

        n = len(paragraphs)
        if n > 0:
            filing_issue_rates[acc] = {
                "generator": gen,
                "n_paragraphs": n,
                "inlined_header_rate": f_inlined / n,
                "orphan_word_rate": f_orphan / n,
                "fragment_rate": f_fragment / n,
                "truncation_rate": f_truncated / n,
                "boilerplate_rate": f_boilerplate / n,
                "combined_rate": (f_inlined + f_orphan + f_fragment + f_truncated + f_boilerplate) / (5 * n),
            }

    if missing_gen:
        print(f"  Note: {missing_gen} filings had no matching HTML file", file=sys.stderr)

    # ── Step 4: Output ──

    # Compute corpus-wide averages for flagging
    corpus_total = sum(s["total_paragraphs"] for s in gen_stats.values())
    corpus_inlined = sum(s["inlined_header"] for s in gen_stats.values())
    corpus_orphan = sum(s["orphan_word"] for s in gen_stats.values())
    corpus_fragment = sum(s["fragment"] for s in gen_stats.values())
    corpus_truncated = sum(s["truncated"] for s in gen_stats.values())
    corpus_boilerplate = sum(s["boilerplate"] for s in gen_stats.values())
    corpus_avg_wc = statistics.mean(
        wc for s in gen_stats.values() for wc in s["word_counts"]
    ) if corpus_total > 0 else 0

    avg_rates = {
        "inlined_header": corpus_inlined / corpus_total if corpus_total else 0,
        "orphan_word": corpus_orphan / corpus_total if corpus_total else 0,
        "fragment": corpus_fragment / corpus_total if corpus_total else 0,
        "truncated": corpus_truncated / corpus_total if corpus_total else 0,
        "boilerplate": corpus_boilerplate / corpus_total if corpus_total else 0,
    }

    print()
    print("=" * 180)
    print("GENERATOR QUALITY CROSS-REFERENCE: SEC-cyBERT CORPUS")
    print("=" * 180)
    print(f"\nCorpus totals: {corpus_total:,} paragraphs across {sum(s['total_filings'] for s in gen_stats.values()):,} filings")
    print(f"Corpus averages: InlinedHdr={avg_rates['inlined_header']:.1%}  Orphan={avg_rates['orphan_word']:.1%}  "
          f"Fragment={avg_rates['fragment']:.1%}  Truncated={avg_rates['truncated']:.1%}  "
          f"Boilerplate={avg_rates['boilerplate']:.1%}  AvgWC={corpus_avg_wc:.1f}")
    print(f"(Cells marked with ** are >2x the corpus average)")

    # Sort by total paragraphs descending
    sorted_gens = sorted(gen_stats.items(), key=lambda x: x[1]["total_paragraphs"], reverse=True)

    # Header
    print()
    hdr = (
        f"{'Generator':<35} {'Files':>6} {'Paras':>7} {'Mean/F':>7} {'Med/F':>6} "
        f"{'AvgWC':>6} {'InlHdr%':>8} {'Orphan%':>8} {'Frag%':>8} {'Trunc%':>8} {'Boiler%':>8}"
    )
    print(hdr)
    print("-" * len(hdr))

    for gen, s in sorted_gens:
        n = s["total_paragraphs"]
        if n == 0:
            continue
        nf = s["total_filings"]
        mean_ppf = n / nf if nf else 0
        med_ppf = statistics.median(s["paragraphs_per_filing"]) if s["paragraphs_per_filing"] else 0
        avg_wc = statistics.mean(s["word_counts"]) if s["word_counts"] else 0

        inl_r = s["inlined_header"] / n
        orp_r = s["orphan_word"] / n
        fra_r = s["fragment"] / n
        tru_r = s["truncated"] / n
        boi_r = s["boilerplate"] / n

        # Flag if >2x corpus average
        def fmt_rate(val, avg_key):
            pct = f"{val:.1%}"
            if avg_rates[avg_key] > 0 and val > 2 * avg_rates[avg_key]:
                return f"{pct:>6}**"
            return f"{pct:>8}"

        row = (
            f"{gen:<35} {nf:>6} {n:>7} {mean_ppf:>7.1f} {med_ppf:>6.0f} "
            f"{avg_wc:>6.1f} {fmt_rate(inl_r, 'inlined_header')} {fmt_rate(orp_r, 'orphan_word')} "
            f"{fmt_rate(fra_r, 'fragment')} {fmt_rate(tru_r, 'truncated')} {fmt_rate(boi_r, 'boilerplate')}"
        )
        print(row)

    print("-" * len(hdr))

    # Corpus average row
    corpus_med_ppf = statistics.median(
        ppf for s in gen_stats.values() for ppf in s["paragraphs_per_filing"]
    )
    corpus_mean_ppf = corpus_total / sum(s["total_filings"] for s in gen_stats.values())
    print(
        f"{'CORPUS AVERAGE':<35} "
        f"{sum(s['total_filings'] for s in gen_stats.values()):>6} "
        f"{corpus_total:>7} "
        f"{corpus_mean_ppf:>7.1f} {corpus_med_ppf:>6.0f} "
        f"{corpus_avg_wc:>6.1f} "
        f"{avg_rates['inlined_header']:>7.1%} "
        f"{avg_rates['orphan_word']:>7.1%}  "
        f"{avg_rates['fragment']:>7.1%} "
        f"{avg_rates['truncated']:>7.1%} "
        f"{avg_rates['boilerplate']:>7.1%}"
    )

    # ── 10 Most Problematic Filings ──
    print()
    print("=" * 180)
    print("10 MOST PROBLEMATIC FILINGS (highest combined issue rate across all 5 metrics)")
    print("=" * 180)

    # Only consider filings with at least 3 paragraphs to avoid noisy tiny filings
    eligible = {acc: fr for acc, fr in filing_issue_rates.items() if fr["n_paragraphs"] >= 3}
    worst = sorted(eligible.items(), key=lambda x: x[1]["combined_rate"], reverse=True)[:10]

    print()
    hdr2 = (
        f"{'Accession':<30} {'Generator':<35} {'Paras':>5} "
        f"{'InlHdr':>7} {'Orphan':>7} {'Frag':>7} {'Trunc':>7} {'Boiler':>7} {'Combined':>8}"
    )
    print(hdr2)
    print("-" * len(hdr2))
    for acc, fr in worst:
        print(
            f"{acc:<30} {fr['generator']:<35} {fr['n_paragraphs']:>5} "
            f"{fr['inlined_header_rate']:>6.1%} {fr['orphan_word_rate']:>6.1%} "
            f"{fr['fragment_rate']:>6.1%} {fr['truncation_rate']:>6.1%} "
            f"{fr['boilerplate_rate']:>6.1%} {fr['combined_rate']:>7.1%}"
        )

    # ── Per-metric worst generators summary ──
    print()
    print("=" * 180)
    print("GENERATORS >2x CORPUS AVERAGE (flagged metrics)")
    print("=" * 180)

    metric_names = {
        "inlined_header": "Inlined Header",
        "orphan_word": "Orphan Word",
        "fragment": "Fragment",
        "truncated": "Truncation",
        "boilerplate": "Boilerplate",
    }

    for metric_key, metric_label in metric_names.items():
        flagged = []
        for gen, s in sorted_gens:
            n = s["total_paragraphs"]
            if n < 10:
                continue
            rate = s[metric_key] / n
            if avg_rates[metric_key] > 0 and rate > 2 * avg_rates[metric_key]:
                flagged.append((gen, rate, s[metric_key], n))
        if flagged:
            print(f"\n  {metric_label} rate (corpus avg: {avg_rates[metric_key]:.1%}, threshold >2x = {2*avg_rates[metric_key]:.1%}):")
            for gen, rate, count, total in sorted(flagged, key=lambda x: -x[1]):
                print(f"    {gen:<35} {rate:.1%}  ({count}/{total})")
        else:
            print(f"\n  {metric_label}: No generators >2x corpus average")


if __name__ == "__main__":
    main()