SEC-cyBERT/scripts/analyze_generator_quality.py
2026-03-29 20:33:39 -04:00

335 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Quantify how EFiling/XDX generator quality issues affect the annotated paragraph set.
READ-ONLY analysis — does not modify any files.
"""
import json
import re
import sys
from collections import Counter, defaultdict
from pathlib import Path
# Reuse detect_generator from the existing script
sys.path.insert(0, str(Path(__file__).parent))
from detect_generators import detect_generator
# Paths
# NOTE(review): absolute, machine-specific input paths — assumes this exact
# checkout location; confirm before running on another machine.
HTML_DIR = Path("/home/joey/Documents/sec-cyBERT/data/raw/html")  # raw filing HTML; filename stem is the accession number
PARAGRAPHS_PATH = Path("/home/joey/Documents/sec-cyBERT/data/paragraphs/paragraphs-clean.jsonl")  # one paragraph JSON object per line
ANNOTATIONS_PATH = Path("/home/joey/Documents/sec-cyBERT/data/annotations/stage1.jsonl")  # one annotation JSON object per line
SEP = "=" * 100  # horizontal rule printed between report sections
def load_paragraphs(path=None):
    """Load paragraphs from a JSONL file.

    Args:
        path: JSONL file to read; defaults to PARAGRAPHS_PATH.

    Returns:
        dict mapping paragraph id -> paragraph dict.
    """
    if path is None:
        path = PARAGRAPHS_PATH
    paragraphs = {}
    # Read JSONL explicitly as UTF-8 (don't rely on the platform default
    # encoding) and tolerate stray blank lines.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            p = json.loads(line)
            paragraphs[p["id"]] = p
    return paragraphs
def load_annotations(path=None):
    """Load annotations from a JSONL file.

    Args:
        path: JSONL file to read; defaults to ANNOTATIONS_PATH.

    Returns:
        dict mapping paragraphId -> annotation dict.  If a paragraphId
        appears more than once, the LAST annotation wins (the original
        comment claimed "first", but dict assignment overwrites — which
        one is kept doesn't matter for the counts this script reports).
    """
    if path is None:
        path = ANNOTATIONS_PATH
    annotations = {}
    # Read JSONL explicitly as UTF-8 and tolerate stray blank lines.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            a = json.loads(line)
            annotations[a["paragraphId"]] = a
    return annotations
def detect_all_generators():
    """Detect generators for all HTML files. Return dict: accession -> generator."""
    html_files = sorted(HTML_DIR.glob("*.html"))
    total = len(html_files)
    accession_to_gen = {}
    for count, html_path in enumerate(html_files, start=1):
        # The filename stem is the filing's accession number.
        generator, _evidence = detect_generator(str(html_path))
        accession_to_gen[html_path.stem] = generator
        # Progress heartbeat on stderr every 3000 files.
        if count % 3000 == 0:
            print(f" Scanned {count}/{total} HTML files...", file=sys.stderr)
    print(f" Scanned {total}/{total} HTML files.", file=sys.stderr)
    return accession_to_gen
def starts_lowercase(text: str) -> bool:
    """True if text starts with a lowercase letter (orphan word candidate)."""
    # Empty strings are not candidates; otherwise test the first character.
    return bool(text) and text[0].islower()
def is_list_item(text: str) -> bool:
    """True if text looks like a list item (starts with bullet, dash, number+period, etc.)."""
    candidate = text.strip()
    if not candidate:
        return False
    # Bullets ("- ", "• ", "* "), numbered ("1. ", "1) "),
    # and lowercase-lettered ("a. ", "a) ") markers.
    plain_markers = (
        r'^[-•*▪◦]\s',
        r'^\d+[.)]\s',
        r'^[a-z][.)]\s',
    )
    if any(re.match(pat, candidate) for pat in plain_markers):
        return True
    # Parenthesized markers — "(a) ", "(3) ", "(iv) " — case-insensitive.
    return re.match(r'^\([a-z0-9ivx]+\)\s', candidate, re.I) is not None
def looks_like_inlined_header(text: str) -> bool:
    """
    True if text starts with a section heading run into body text, e.g.:
        "Risk Management and Strategy We recognize the importance..."
        "Cybersecurity Governance Our Board of Directors oversees..."

    The heading portion is one of the known SEC Item 1C section-title
    phrases below, joined to the following body text by bare whitespace —
    no period, colon, or other punctuation separating the heading from the
    next sentence.
    """
    # Known heading patterns for SEC Item 1C disclosures
    heading_patterns = (
        r'(?:Cybersecurity\s+)?Risk\s+Management(?:\s+and\s+Strategy)?',
        r'(?:Cybersecurity\s+)?Governance(?:\s+and\s+Risk\s+Management)?',
        r'Cybersecurity\s+Governance',
        r'Cybersecurity\s+Risk\s+Management\s+and\s+Strategy',
        r'Board\s+Oversight(?:\s+of\s+(?:Risks?\s+from\s+)?Cybersecurity(?:\s+(?:Threats?|Risks?))?)?',
        r'Management(?:\'s)?\s+Role\s+in\s+(?:Managing\s+)?Cybersecurity',
        r'Governance\s+(?:Related\s+to|Oversight\s+of)\s+Cybersecurity(?:\s+Risks?)?',
        r'Impact\s+of\s+Cybersecurity\s+(?:Risks?|Threats?)',
        r'Cybersecurity\s+(?:Strategy|Overview|Program)',
        r'(?:Management\s+and|Management|Governance)\s+(?:Strategy|Overview)',
        r'Risk\s+Factors?',
        r'Oversight\s+of\s+Cybersecurity\s+Risk\s+Management',
    )
    # Heading immediately followed by a letter starting the body text
    # (usually uppercase for a new sentence, occasionally lowercase).
    return any(
        re.match(rf'^(?:{pat})\s+[A-Za-z]', text) is not None
        for pat in heading_patterns
    )
def main():
    """Run the read-only generator-quality report over the annotated paragraph set.

    Prints five sections to stdout:
      1. annotated paragraphs broken down by HTML generator,
      2. lowercase-start ("orphan word") paragraphs,
      3. orphan-word details (generator breakdown, examples, category skew),
      4. inlined-header paragraphs,
      5. combined impact summary (union of both issues).
    Progress/warning messages from generator detection go to stderr.
    """
    print("Loading data...")
    paragraphs = load_paragraphs()
    annotations = load_annotations()
    print(f" Paragraphs: {len(paragraphs):,}")
    print(f" Annotations: {len(annotations):,}")
    # Unique annotated paragraph IDs (only those with matching paragraph data)
    annotated_ids = set(annotations.keys()) & set(paragraphs.keys())
    print(f" Annotated paragraphs with matching paragraph data: {len(annotated_ids):,}")
    print("\nDetecting generators for all HTML files...")
    accession_to_gen = detect_all_generators()
    print(f" HTML files scanned: {len(accession_to_gen):,}")
    # Map each paragraph to its generator via the filing's accession number
    para_to_gen = {}
    missing_accessions = set()
    for pid, p in paragraphs.items():
        acc = p["filing"]["accessionNumber"]
        gen = accession_to_gen.get(acc)
        if gen is None:
            # Paragraph's filing has no corresponding HTML file on disk;
            # bucket it under a sentinel generator name.
            missing_accessions.add(acc)
            gen = "NO_HTML_FILE"
        para_to_gen[pid] = gen
    if missing_accessions:
        print(f"\n WARNING: {len(missing_accessions)} accession numbers in paragraphs have no HTML file")
    # =====================================================================
    # SECTION 1: Annotated paragraphs by generator
    # =====================================================================
    print(f"\n{SEP}")
    print("SECTION 1: Annotated paragraphs by generator")
    print(SEP)
    ann_gen_counts = Counter()
    for pid in annotated_ids:
        ann_gen_counts[para_to_gen[pid]] += 1
    total_ann = len(annotated_ids)
    print(f"\n{'Generator':<50} {'Count':>7} {'%':>7}")
    print("-" * 70)
    for gen, count in ann_gen_counts.most_common():
        pct = count / total_ann * 100
        print(f"{gen:<50} {count:>7} {pct:>6.1f}%")
    print("-" * 70)
    print(f"{'TOTAL':<50} {total_ann:>7} {100.0:>6.1f}%")
    # =====================================================================
    # SECTION 2: Lowercase-start (orphan word) analysis for annotated set
    # =====================================================================
    print(f"\n{SEP}")
    print("SECTION 2: Lowercase-start paragraphs in annotated set")
    print(SEP)
    # All annotated lowercase-start...
    ann_lc = {pid for pid in annotated_ids if starts_lowercase(paragraphs[pid]["text"])}
    # ...minus genuine list items, which legitimately start lowercase
    ann_lc_nonlist = {pid for pid in ann_lc if not is_list_item(paragraphs[pid]["text"])}
    print(f"\nAnnotated paragraphs starting with lowercase: {len(ann_lc):,} / {total_ann:,} ({len(ann_lc)/total_ann*100:.2f}%)")
    print(f" Of those, excluding list items: {len(ann_lc_nonlist):,} ({len(ann_lc_nonlist)/total_ann*100:.2f}%)")
    # Breakdown by generator for lowercase-start non-list
    lc_by_gen = Counter()
    for pid in ann_lc_nonlist:
        lc_by_gen[para_to_gen[pid]] += 1
    print(f"\n{'Generator':<50} {'LC-start':>9} {'Total ann':>10} {'% of gen':>9}")
    print("-" * 85)
    for gen, _ in ann_gen_counts.most_common():
        lc_count = lc_by_gen.get(gen, 0)
        gen_total = ann_gen_counts[gen]
        pct = lc_count / gen_total * 100 if gen_total else 0
        if lc_count > 0:
            print(f"{gen:<50} {lc_count:>9} {gen_total:>10} {pct:>8.1f}%")
    # Specific callouts for the two generator families of interest
    efiling_gens = {"EFiling/EDGAR Agent", "EFiling XDX"}
    efiling_ann = {pid for pid in annotated_ids if para_to_gen[pid] in efiling_gens}
    efiling_lc = {pid for pid in ann_lc_nonlist if para_to_gen[pid] in efiling_gens}
    compsci_ann = {pid for pid in annotated_ids if para_to_gen[pid] == "CompSci Transform"}
    compsci_lc = {pid for pid in ann_lc_nonlist if para_to_gen[pid] == "CompSci Transform"}
    print(f"\n--- Specific callouts ---")
    print(f"EFiling/XDX annotated paragraphs starting lowercase (non-list): {len(efiling_lc):,} / {len(efiling_ann):,} ({len(efiling_lc)/len(efiling_ann)*100:.1f}% of EFiling/XDX)" if efiling_ann else "EFiling/XDX: 0 annotated paragraphs")
    print(f"CompSci Transform annotated paragraphs starting lowercase (non-list): {len(compsci_lc):,} / {len(compsci_ann):,} ({len(compsci_lc)/len(compsci_ann)*100:.1f}% of CompSci)" if compsci_ann else "CompSci Transform: 0 annotated paragraphs")
    print(f"\nTotal affected annotated paragraphs (LC non-list): {len(ann_lc_nonlist):,} / {total_ann:,} = {len(ann_lc_nonlist)/total_ann*100:.2f}%")
    # =====================================================================
    # SECTION 3: Orphan-word paragraphs detail
    # =====================================================================
    print(f"\n{SEP}")
    print("SECTION 3: Orphan-word paragraph details (LC-start, non-list, annotated)")
    print(SEP)
    # Breakdown by generator
    print(f"\nBreakdown by generator:")
    print(f"{'Generator':<50} {'Count':>7} {'% of orphan':>12}")
    print("-" * 75)
    for gen, count in lc_by_gen.most_common():
        pct = count / len(ann_lc_nonlist) * 100
        print(f"{gen:<50} {count:>7} {pct:>11.1f}%")
    # 10 example texts with labels (first 10 by sorted paragraph id)
    print(f"\n10 example orphan-word annotated paragraphs:")
    print("-" * 100)
    examples = sorted(ann_lc_nonlist)[:10]
    for pid in examples:
        text = paragraphs[pid]["text"][:150]
        ann = annotations[pid]
        label = ann.get("label", {})
        cat = label.get("content_category", "?")
        spec = label.get("specificity_level", "?")
        gen = para_to_gen[pid]
        print(f" [{gen}] cat={cat}, spec={spec}")
        print(f" \"{text}...\"")
        print()
    # Category distribution in orphan-word paragraphs vs overall
    print(f"\nCategory distribution: orphan-word vs overall annotated set")
    print("-" * 80)
    orphan_cats = Counter()
    for pid in ann_lc_nonlist:
        cat = annotations[pid].get("label", {}).get("content_category", "Unknown")
        orphan_cats[cat] += 1
    overall_cats = Counter()
    for pid in annotated_ids:
        cat = annotations[pid].get("label", {}).get("content_category", "Unknown")
        overall_cats[cat] += 1
    all_cats = sorted(set(orphan_cats.keys()) | set(overall_cats.keys()))
    print(f"{'Category':<40} {'Orphan':>7} {'Orphan%':>8} {'Overall':>8} {'Overall%':>9} {'Over-rep':>9}")
    print("-" * 85)
    for cat in all_cats:
        o_count = orphan_cats.get(cat, 0)
        a_count = overall_cats.get(cat, 0)
        o_pct = o_count / len(ann_lc_nonlist) * 100 if ann_lc_nonlist else 0
        a_pct = a_count / total_ann * 100
        # Over-representation ratio: >1 means the category is more common
        # among orphan-word paragraphs than in the annotated set overall;
        # >1.5x is flagged with "<<<".
        ratio = (o_pct / a_pct) if a_pct > 0 else 0
        flag = " <<<" if ratio > 1.5 else ""
        print(f"{cat:<40} {o_count:>7} {o_pct:>7.1f}% {a_count:>8} {a_pct:>8.1f}% {ratio:>8.2f}x{flag}")
    # =====================================================================
    # SECTION 4: Inlined headers analysis
    # =====================================================================
    print(f"\n{SEP}")
    print("SECTION 4: Inlined headers in annotated paragraphs")
    print(SEP)
    ann_inlined = set()
    for pid in annotated_ids:
        text = paragraphs[pid]["text"]
        if looks_like_inlined_header(text):
            ann_inlined.add(pid)
    print(f"\nAnnotated paragraphs with inlined headers: {len(ann_inlined):,} / {total_ann:,} ({len(ann_inlined)/total_ann*100:.2f}%)")
    inlined_by_gen = Counter()
    for pid in ann_inlined:
        inlined_by_gen[para_to_gen[pid]] += 1
    print(f"\n{'Generator':<50} {'Inlined':>8} {'Total ann':>10} {'% of gen':>9}")
    print("-" * 85)
    for gen, _ in ann_gen_counts.most_common():
        ih_count = inlined_by_gen.get(gen, 0)
        gen_total = ann_gen_counts[gen]
        pct = ih_count / gen_total * 100 if gen_total else 0
        if ih_count > 0:
            print(f"{gen:<50} {ih_count:>8} {gen_total:>10} {pct:>8.1f}%")
    # Show some examples (first 10 by sorted paragraph id)
    print(f"\n10 example inlined-header paragraphs:")
    print("-" * 100)
    examples_ih = sorted(ann_inlined)[:10]
    for pid in examples_ih:
        text = paragraphs[pid]["text"][:150]
        gen = para_to_gen[pid]
        cat = annotations[pid].get("label", {}).get("content_category", "?")
        print(f" [{gen}] cat={cat}")
        print(f" \"{text}...\"")
        print()
    # =====================================================================
    # SECTION 5: Combined impact summary
    # =====================================================================
    print(f"\n{SEP}")
    print("SECTION 5: Combined impact summary")
    print(SEP)
    # Union of the two defect classes; a paragraph can exhibit both.
    affected = ann_lc_nonlist | ann_inlined
    print(f"\nOrphan-word (LC non-list): {len(ann_lc_nonlist):>6} ({len(ann_lc_nonlist)/total_ann*100:.2f}%)")
    print(f"Inlined headers: {len(ann_inlined):>6} ({len(ann_inlined)/total_ann*100:.2f}%)")
    print(f"Either issue (union): {len(affected):>6} ({len(affected)/total_ann*100:.2f}%)")
    print(f"Total annotated set: {total_ann:>6}")
    # EFiling/XDX specifically
    efiling_affected = {pid for pid in affected if para_to_gen[pid] in efiling_gens}
    print(f"\nEFiling/XDX affected (either issue): {len(efiling_affected):,} / {len(efiling_ann):,}")
# Run the report only when executed as a script (not on import).
if __name__ == "__main__":
    main()