""" Heading candidate detection in SEC-cyBERT paragraph data. Searches for inlined section headings that previous passes missed. READ-ONLY: does not modify data, prints analysis to stdout. """ import json import re from collections import Counter, defaultdict from pathlib import Path DATA_PATH = Path(__file__).resolve().parent.parent / "data" / "paragraphs" / "paragraphs-clean.jsonl" # ── Load data ────────────────────────────────────────────────────────────────── print(f"Loading data from {DATA_PATH} ...") paragraphs = [] with open(DATA_PATH) as f: for line in f: paragraphs.append(json.loads(line)) print(f"Loaded {len(paragraphs):,} paragraphs.\n") # ── Helpers ──────────────────────────────────────────────────────────────────── def preview(text: str, n: int = 150) -> str: """First n chars, single-line.""" return text[:n].replace("\n", " ").strip() COMMON_SENTENCE_STARTERS = { "we", "our", "the", "a", "an", "as", "in", "on", "to", "for", "if", "this", "these", "that", "those", "it", "its", "such", "no", "not", "with", "from", "at", "by", "or", "and", "all", "any", "each", "while", "when", "where", "although", "because", "since", "after", "before", "during", "under", "over", "between", "through", "into", "upon", "about", "there", "here", "however", "additionally", "furthermore", "moreover", "also", "finally", "similarly", "accordingly", "consequently", "therefore", "thus", "nonetheless", "notwithstanding", "specifically", "generally", "currently", "recently", "historically", "collectively", "certain", } HEADING_KEYWORDS = { "oversight", "framework", "assessment", "compliance", "integration", "governance", "strategy", "management", "disclosure", "reporting", "response", "recovery", "prevention", "detection", "monitoring", "awareness", "training", "policy", "policies", "procedures", "controls", "cybersecurity", "information", "security", "risk", "board", "committee", "audit", "technology", "infrastructure", "incident", "incidents", "threat", "threats", 
"vulnerability", "program", "processes", "overview", "background", "introduction", "summary", "conclusion", "material", "materiality", } HEADING_GERUNDS = { "protecting", "monitoring", "assessing", "managing", "overseeing", "implementing", "establishing", "maintaining", "identifying", "evaluating", "mitigating", "addressing", "enhancing", "ensuring", "integrating", "reporting", "disclosing", "detecting", "preventing", "responding", "recovering", "training", "educating", "reviewing", "governing", "supervising", "coordinating", "leveraging", "strengthening", "safeguarding", "securing", } SEPARATOR_LINE = "=" * 100 def print_section(title: str): print(f"\n{SEPARATOR_LINE}") print(f" {title}") print(SEPARATOR_LINE) # ══════════════════════════════════════════════════════════════════════════════ # APPROACH 1: First-sentence grammatical analysis # ══════════════════════════════════════════════════════════════════════════════ print_section("APPROACH 1: First-clause looks like a heading (title case prefix → sentence body)") # Pattern: first N words are in title case, then a transition to normal # sentence text. E.g. "Risk Management and Strategy We have..." 
approach1_hits = []
for p in paragraphs:
    text = p["text"].strip()
    words = text.split()
    if len(words) < 6:
        continue

    # Find the transition point: where title-case words stop.
    title_words = 0
    for w in words:
        # Strip punctuation for checking.
        clean = re.sub(r"[^a-zA-Z]", "", w)
        if not clean:
            title_words += 1
            continue
        # "and", "of", "the", "for", "in", "on" can be lowercase in titles.
        if clean.lower() in {"and", "of", "the", "for", "in", "on", "a", "an", "or", "to", "by", "with"}:
            title_words += 1
            continue
        if clean[0].isupper():
            title_words += 1
        else:
            break

    # We want 3+ title-case words at the start, then a transition.
    if title_words >= 3 and title_words < len(words) - 2:
        # The word after the title block must start lowercase (sentence body).
        rest_start = words[title_words] if title_words < len(words) else ""
        rest_clean = re.sub(r"[^a-zA-Z]", "", rest_start)
        if rest_clean and rest_clean[0].islower():
            heading_part = " ".join(words[:title_words])
            # Skip if the heading begins with a common sentence starter.
            if heading_part.lower().split()[0] not in COMMON_SENTENCE_STARTERS:
                approach1_hits.append({
                    "id": p["id"],
                    "heading_words": title_words,
                    "heading": heading_part,
                    "preview": preview(text),
                })

# Count heading patterns. Build a first-example index in ONE pass instead of
# re-scanning approach1_hits with next(...) for every heading printed — the
# original was accidentally O(hits × headings).
heading_counter = Counter(h["heading"] for h in approach1_hits)
heading_example = {}
for h in approach1_hits:
    heading_example.setdefault(h["heading"], h)

print(f"\nFound {len(approach1_hits):,} paragraphs with title-case prefix → lowercase body.")
print(f"Unique heading prefixes: {len(heading_counter):,}")
print("\nTOP 30 most common heading prefixes:")
for heading, count in heading_counter.most_common(30):
    ex = heading_example[heading]
    print(f" [{count:4d}x] \"{heading}\"")
    print(f" Example: {ex['preview']}")

print("\nSample of UNIQUE (1x) heading prefixes (first 30):")
# Counter preserves insertion order, so this keeps the original ordering.
unique_headings = [(h, heading_example[h]) for h, c in heading_counter.items() if c == 1]
for heading, ex in unique_headings[:30]:
    print(f" \"{heading}\"")
    print(f" → {ex['preview']}")

# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 2: Capitalization anomalies
# ══════════════════════════════════════════════════════════════════════════════
print_section("APPROACH 2: Capitalization anomalies")

# 2a: ALL CAPS at start
allcaps_hits = []
for p in paragraphs:
    text = p["text"].strip()
    words = text.split()
    if len(words) < 4:
        continue
    # Count leading fully-upper-case words (len > 1 so single letters like
    # "A" or initials do not count).
    caps_count = 0
    for w in words:
        clean = re.sub(r"[^a-zA-Z]", "", w)
        if not clean:
            caps_count += 1
            continue
        if clean.isupper() and len(clean) > 1:
            caps_count += 1
        else:
            break
    if caps_count >= 3:
        allcaps_hits.append({
            "id": p["id"],
            "caps_words": caps_count,
            "preview": preview(text),
        })

print(f"\n2a. ALL CAPS for first 3+ words: {len(allcaps_hits):,} paragraphs")
for h in allcaps_hits[:20]:
    print(f" [{h['caps_words']} caps words] {h['preview']}")

# 2b: First word is capitalized but NOT a common sentence starter
# and looks like a heading keyword
heading_start_hits = []
for p in paragraphs:
    text = p["text"].strip()
    words = text.split()
    if len(words) < 4:
        continue
    first_word = re.sub(r"[^a-zA-Z]", "", words[0]).lower()
    if first_word in HEADING_KEYWORDS and first_word not in COMMON_SENTENCE_STARTERS:
        heading_start_hits.append({
            "id": p["id"],
            "first_word": first_word,
            "preview": preview(text),
        })

heading_start_counter = Counter(h["first_word"] for h in heading_start_hits)
print(f"\n2b. First word is a heading keyword (not a sentence starter): {len(heading_start_hits):,} paragraphs")
print("Breakdown by keyword:")
# One-pass first-example index: avoids an O(n) next(...) scan per keyword.
keyword_example = {}
for h in heading_start_hits:
    keyword_example.setdefault(h["first_word"], h)
for kw, count in heading_start_counter.most_common(30):
    ex = keyword_example[kw]
    print(f" [{count:4d}x] \"{kw}\" — {ex['preview']}")

# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 3: Separator patterns
# ══════════════════════════════════════════════════════════════════════════════
print_section("APPROACH 3: Separator patterns (heading followed by separator then sentence)")

# Each pattern captures a plausible heading prefix followed by a separator
# and the start of a sentence body.
separator_patterns = {
    "period": re.compile(r"^([A-Z][A-Za-z\s,&]{3,60})\.\s+([A-Z][a-z])"),
    "dash/em-dash": re.compile(r"^([A-Z][A-Za-z\s,&]{3,60})\s*[–—-]\s*([A-Z][a-z])"),
    "semicolon": re.compile(r"^([A-Z][A-Za-z\s,&]{3,60});\s*([A-Z][a-z])"),
    "double space": re.compile(r"^([A-Z][A-Za-z\s,&]{3,60})\s{2,}([A-Z][a-z])"),
    "colon": re.compile(r"^([A-Z][A-Za-z\s,&]{3,60}):\s*([A-Z][a-z])"),
    "parenthetical prefix": re.compile(r"^\([a-z0-9ivx]+\)\s*([A-Z][A-Za-z\s,&]{3,60})\s+([a-z])"),
    "bullet/pipe prefix": re.compile(r"^[•●■▪◦‣|]\s*([A-Z][A-Za-z\s,&]{3,60})\s+([a-z])"),
    "tab separator": re.compile(r"^([A-Z][A-Za-z\s,&]{3,60})\t+(.+)"),
}

for sep_name, pattern in separator_patterns.items():
    hits = []
    for p in paragraphs:
        text = p["text"].strip()
        m = pattern.match(text)
        if m:
            heading_candidate = m.group(1).strip() if m.lastindex >= 1 else ""
            # Filter: heading should have at least 2 words.
            if len(heading_candidate.split()) >= 2:
                hits.append({
                    "id": p["id"],
                    "heading": heading_candidate,
                    "preview": preview(text),
                })
    heading_counts = Counter(h["heading"] for h in hits)
    # First-example index built once, replacing a per-heading next(...) scan.
    sep_example = {}
    for h in hits:
        sep_example.setdefault(h["heading"], h)
    print(f"\n Separator: {sep_name} — {len(hits):,} hits")
    if hits:
        for heading, count in heading_counts.most_common(20):
            ex = sep_example[heading]
            print(f" [{count:4d}x] \"{heading}\"")
            print(f" {ex['preview']}")

# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 4: Repeated first-3-words analysis
# ══════════════════════════════════════════════════════════════════════════════
print_section("APPROACH 4: Repeated first-3-word phrases")

first3_counter = Counter()
first3_examples = {}
for p in paragraphs:
    text = p["text"].strip()
    words = text.split()
    if len(words) < 4:
        continue
    first3 = " ".join(words[:3])
    first3_counter[first3] += 1
    # Remember the first paragraph seen for each phrase.
    first3_examples.setdefault(first3, preview(text))

# Phrases that open ordinary sentences; excluded from the report even when
# they repeat often.
common_starts = {
    "we have implemented", "we have established", "we have adopted",
    "we have not", "we do not", "we are not", "we believe that", "we use a",
    "we rely on", "we have a", "we also have", "our board of",
    "the board of", "the company has", "the audit committee",
    "in addition to", "as part of", "as a result", "in the event",
    "as of the", "in accordance with", "with respect to",
}

# Lowercase words permitted inside a title-case phrase.
TITLE_FILLER_WORDS = {"and", "of", "the", "for", "in", "on", "a", "or", "to"}

print(f"\nFirst-3-word phrases appearing 5+ times (excluding common sentence starts):")
for phrase, count in first3_counter.most_common(200):
    if count < 5:
        break
    if phrase.lower() in common_starts:
        continue
    # Heading-like: every alphabetic word is title case (or a filler word)
    # and the phrase does not open like an ordinary sentence.
    words_lower = phrase.lower().split()
    is_heading_like = (
        all(
            w[0].isupper() or w in TITLE_FILLER_WORDS
            for w in phrase.split()
            if re.sub(r"[^a-zA-Z]", "", w)
        )
        and words_lower[0] not in COMMON_SENTENCE_STARTERS
    )
    label = " [HEADING-LIKE]" if is_heading_like else ""
    print(f" [{count:4d}x] \"{phrase}\"{label}")
    print(f" {first3_examples[phrase]}")

# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 5: Cross-paragraph heading detection (short para → sentence para)
# ══════════════════════════════════════════════════════════════════════════════
print_section("APPROACH 5: Cross-paragraph heading detection (standalone short headings)")

# Group paragraphs by accession number, sorted by index
by_filing = defaultdict(list)
for p in paragraphs:
    acc = p["filing"]["accessionNumber"]
    by_filing[acc].append(p)
for acc in by_filing:
    by_filing[acc].sort(key=lambda x: x["paragraphIndex"])

# Lowercase words allowed inside a title-case heading.
_TITLE_LOWER = {"and", "of", "the", "for", "in", "on", "a", "or", "to", "by", "with"}

standalone_headings = []
for acc, pars in by_filing.items():
    # Walk adjacent paragraph pairs within one filing.
    for curr, nxt in zip(pars, pars[1:]):
        curr_text = curr["text"].strip()
        curr_words = curr_text.split()
        nxt_text = nxt["text"].strip()

        # Current paragraph must be short: 2-10 words.
        if len(curr_words) > 10 or len(curr_words) < 2:
            continue

        # Headings rarely end with a period ("etc." is tolerated).
        if curr_text.endswith(".") and not curr_text.endswith("etc."):
            continue

        # Strip punctuation ONCE per word; the original recomputed the same
        # re.sub twice per word inside the ratio expression.
        cleaned = [re.sub(r"[^a-zA-Z]", "", w) for w in curr_words]
        alpha_words = [c for c in cleaned if c]
        if not alpha_words:
            continue

        # Fraction of words that are title case or allowed lowercase fillers.
        title_case_ratio = sum(
            1 for c in alpha_words
            if c[0].isupper() or c.lower() in _TITLE_LOWER
        ) / len(alpha_words)
        if title_case_ratio < 0.8:
            continue

        # Next paragraph must be long enough to be sentence-like.
        # NOTE(review): the original comment also promised a lowercase-second-
        # word / common-starter check, but only this length test was ever
        # implemented; behavior preserved as-is.
        nxt_words = nxt_text.split()
        if len(nxt_words) < 3:
            continue

        standalone_headings.append({
            "id": curr["id"],
            "heading_text": curr_text,
            "next_preview": preview(nxt_text),
            "accession": acc,
            "company": curr["filing"]["companyName"],
        })

heading_text_counter = Counter(h["heading_text"] for h in standalone_headings)
# First-example index built once, replacing a per-heading next(...) scan.
standalone_example = {}
for h in standalone_headings:
    standalone_example.setdefault(h["heading_text"], h)

print(f"\nFound {len(standalone_headings):,} potential standalone heading paragraphs.")
print(f"Unique heading texts: {len(heading_text_counter):,}")
print("\nTOP 30 most common standalone headings:")
for heading, count in heading_text_counter.most_common(30):
    ex = standalone_example[heading]
    print(f" [{count:4d}x] \"{heading}\"")
    print(f" Next para: {ex['next_preview']}")

print("\nSample of UNIQUE standalone headings (first 30):")
unique_standalone = [h for h in standalone_headings if heading_text_counter[h["heading_text"]] == 1]
for h in unique_standalone[:30]:
    print(f" \"{h['heading_text']}\" ({h['company']})")
    print(f" Next: {h['next_preview']}")

# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 6: Unusual word patterns at paragraph start
# ══════════════════════════════════════════════════════════════════════════════
print_section("APPROACH 6: Unusual starting words (gerunds, heading nouns)")

# 6a: Gerunds at start
gerund_hits = []
for p in paragraphs:
    text = p["text"].strip()
    words = text.split()
    if len(words) < 4:
        continue
    first_word = re.sub(r"[^a-zA-Z]", "", words[0]).lower()
    if first_word.endswith("ing") and len(first_word) > 4:
        # NOTE(review): COMMON_SENTENCE_STARTERS contains no "-ing" words, so
        # the right-hand side is always true here and the HEADING_GERUNDS
        # membership test is redundant; kept as-is to preserve behavior.
        if first_word in HEADING_GERUNDS or first_word not in COMMON_SENTENCE_STARTERS:
            gerund_hits.append({
                "id": p["id"],
                "first_word": first_word,
                "preview": preview(text),
            })

gerund_counter = Counter(h["first_word"] for h in gerund_hits)
# First-example index built once, replacing a per-gerund next(...) scan.
gerund_example = {}
for h in gerund_hits:
    gerund_example.setdefault(h["first_word"], h)

print(f"\n6a. Paragraphs starting with gerunds: {len(gerund_hits):,}")
print("TOP 20 gerunds:")
for word, count in gerund_counter.most_common(20):
    ex = gerund_example[word]
    print(f" [{count:4d}x] \"{word}\" — {ex['preview']}")

# 6b: Heading nouns at start (already covered in 2b, but let's look at
# multi-word patterns starting with heading nouns)
noun_phrase_hits = []
for p in paragraphs:
    text = p["text"].strip()
    words = text.split()
    if len(words) < 4:
        continue
    first_word = re.sub(r"[^a-zA-Z]", "", words[0]).lower()
    if first_word in HEADING_KEYWORDS:
        # Check if the first few words form a heading-like phrase. A slice
        # clamps automatically, so the original min(4, len(words)) was
        # redundant.
        first_few = " ".join(words[:4])
        noun_phrase_hits.append({
            "id": p["id"],
            "first_few": first_few,
            "preview": preview(text),
        })

noun_counter = Counter(h["first_few"] for h in noun_phrase_hits)
print(f"\n6b. Paragraphs starting with heading keyword nouns: {len(noun_phrase_hits):,}")
print("TOP 20 opening phrases:")
# First-example index built once, replacing a per-phrase next(...) scan.
noun_example = {}
for h in noun_phrase_hits:
    noun_example.setdefault(h["first_few"], h)
for phrase, count in noun_counter.most_common(20):
    ex = noun_example[phrase]
    print(f" [{count:4d}x] \"{phrase}\" — {ex['preview']}")

# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 7: Numbers/letters at start (list items / numbered headings)
# ══════════════════════════════════════════════════════════════════════════════
print_section("APPROACH 7: Numbered/lettered items at paragraph start")

numbered_patterns = {
    "roman_paren": re.compile(r"^\((?:i{1,3}|iv|v|vi{0,3}|ix|x)\)\s"),
    "letter_paren": re.compile(r"^\([a-z]\)\s"),
    "number_paren": re.compile(r"^\(\d+\)\s"),
    "number_dot": re.compile(r"^\d+\.\s"),
    "letter_dot": re.compile(r"^[a-z]\.\s"),
    "roman_dot": re.compile(r"^(?:i{1,3}|iv|v|vi{0,3}|ix|x)\.\s"),
    "bullet_chars": re.compile(r"^[•●■▪◦‣►▸→·]\s"),
    "dash_bullet": re.compile(r"^[-–—]\s+[A-Z]"),
}

for pattern_name, pattern in numbered_patterns.items():
    hits = []
    for p in paragraphs:
        text = p["text"].strip()
        if pattern.match(text):
            hits.append({
                "id": p["id"],
                "preview": preview(text),
                "wordCount": p["wordCount"],
            })
    print(f"\n Pattern: {pattern_name} — {len(hits):,} hits")
    if hits:
        # Word-count distribution in a single pass (the original summed the
        # same list three times).
        short = medium = long_count = 0
        for h in hits:
            wc = h["wordCount"]
            if wc < 15:
                short += 1
            elif wc < 50:
                medium += 1
            else:
                long_count += 1
        print(f" Length distribution: <15 words: {short}, 15-49: {medium}, 50+: {long_count}")
        print(f" Examples (first 10):")
        for h in hits[:10]:
            print(f" [{h['wordCount']:3d}w] {h['preview']}")

# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 8 (BONUS): Colon-separated inline headings deep dive
# ══════════════════════════════════════════════════════════════════════════════
print_section("APPROACH 8 (BONUS): Known heading phrases appearing ANYWHERE in first sentence")
# Check for known SEC 1C heading phrases appearing at the start of a paragraph
# even if not perfectly title-cased.
known_heading_phrases = [
    "risk management", "risk assessment", "risk factors", "governance",
    "board oversight", "board of directors", "incident response",
    "third party", "third-party", "cybersecurity program",
    "cybersecurity risk", "cybersecurity governance", "information security",
    "data protection", "data privacy", "security operations",
    "security awareness", "management oversight", "committee oversight",
    "risk management and strategy", "risk management, strategy",
    "material cybersecurity", "materiality assessment", "disclosure controls",
]

phrase_hits = defaultdict(list)
for p in paragraphs:
    text = p["text"].strip()
    # Only the first ~80 characters are examined.
    opening = text[:80].lower()
    matched_phrases = [ph for ph in known_heading_phrases if opening.startswith(ph)]
    for phrase in matched_phrases:
        phrase_hits[phrase].append({
            "id": p["id"],
            "preview": preview(text),
        })

print(f"\nParagraphs starting with known heading phrases:")
# Most-hit phrases first; sort stability keeps insertion order on ties,
# matching the original key-based sort.
for phrase, hits in sorted(phrase_hits.items(), key=lambda kv: -len(kv[1])):
    print(f"\n \"{phrase}\" — {len(hits)} hits")
    for h in hits[:5]:
        print(f" {h['preview']}")

# ══════════════════════════════════════════════════════════════════════════════
# SUMMARY
# ══════════════════════════════════════════════════════════════════════════════
print_section("SUMMARY")

print(f"""
Approach 1 (title-case prefix → body): {len(approach1_hits):,} hits
Approach 2a (ALL CAPS start): {len(allcaps_hits):,} hits
Approach 2b (heading keyword start): {len(heading_start_hits):,} hits
Approach 3 (separator patterns): see above per-separator
Approach 5 (standalone short headings): {len(standalone_headings):,} hits
Approach 6a (gerund starts): {len(gerund_hits):,} hits
Approach 6b (heading noun starts): {len(noun_phrase_hits):,} hits
Approach 7 (numbered/lettered): see above per-pattern
Approach 8 (known phrase starts): {sum(len(v) for v in phrase_hits.values()):,} hits
""")
print("Done.")