SEC-cyBERT/scripts/find_heading_candidates.py
2026-03-29 20:33:39 -04:00

512 lines
23 KiB
Python

"""
Heading candidate detection in SEC-cyBERT paragraph data.
Searches for inlined section headings that previous passes missed.
READ-ONLY: does not modify data, prints analysis to stdout.
"""
import json
import re
from collections import Counter, defaultdict
from pathlib import Path
# Input corpus: one JSON object per line (JSONL), produced by earlier cleaning passes.
DATA_PATH = Path(__file__).resolve().parent.parent / "data" / "paragraphs" / "paragraphs-clean.jsonl"
# ── Load data ──────────────────────────────────────────────────────────────────
print(f"Loading data from {DATA_PATH} ...")
paragraphs = []
with open(DATA_PATH, encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline) so json.loads never
        # sees an empty string.
        if line.strip():
            paragraphs.append(json.loads(line))
print(f"Loaded {len(paragraphs):,} paragraphs.\n")
# ── Helpers ────────────────────────────────────────────────────────────────────
def preview(text: str, n: int = 150) -> str:
    """Return the first *n* characters of *text* as a single display line.

    Newlines are collapsed to spaces and surrounding whitespace is stripped,
    so the result is safe to embed in one-line console output.
    """
    return text[:n].replace("\n", " ").strip()
# Words that commonly open ordinary sentences; a candidate prefix whose first
# word is one of these is unlikely to be a section heading.
COMMON_SENTENCE_STARTERS = {
    "we", "our", "the", "a", "an", "as", "in", "on", "to", "for", "if",
    "this", "these", "that", "those", "it", "its", "such", "no", "not",
    "with", "from", "at", "by", "or", "and", "all", "any", "each",
    "while", "when", "where", "although", "because", "since", "after",
    "before", "during", "under", "over", "between", "through", "into",
    "upon", "about", "there", "here", "however", "additionally",
    "furthermore", "moreover", "also", "finally", "similarly",
    "accordingly", "consequently", "therefore", "thus", "nonetheless",
    "notwithstanding", "specifically", "generally", "currently",
    "recently", "historically", "collectively", "certain",
}

# Nouns that frequently appear in SEC cybersecurity-disclosure section headings.
HEADING_KEYWORDS = {
    "oversight", "framework", "assessment", "compliance", "integration",
    "governance", "strategy", "management", "disclosure", "reporting",
    "response", "recovery", "prevention", "detection", "monitoring",
    "awareness", "training", "policy", "policies", "procedures",
    "controls", "cybersecurity", "information", "security", "risk",
    "board", "committee", "audit", "technology", "infrastructure",
    "incident", "incidents", "threat", "threats", "vulnerability",
    "program", "processes", "overview", "background", "introduction",
    "summary", "conclusion", "material", "materiality",
}

# Gerunds that typically open heading-style phrases ("Managing Cyber Risk").
HEADING_GERUNDS = {
    "protecting", "monitoring", "assessing", "managing", "overseeing",
    "implementing", "establishing", "maintaining", "identifying",
    "evaluating", "mitigating", "addressing", "enhancing", "ensuring",
    "integrating", "reporting", "disclosing", "detecting", "preventing",
    "responding", "recovering", "training", "educating", "reviewing",
    "governing", "supervising", "coordinating", "leveraging",
    "strengthening", "safeguarding", "securing",
}
# Full-width rule used to delimit each analysis section in the output.
SEPARATOR_LINE = "=" * 100


def print_section(title: str) -> None:
    """Print *title* framed between two full-width '=' separator lines."""
    print(f"\n{SEPARATOR_LINE}")
    print(f" {title}")
    print(SEPARATOR_LINE)
# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 1: First-sentence grammatical analysis
# ══════════════════════════════════════════════════════════════════════════════
print_section("APPROACH 1: First-clause looks like a heading (title case prefix → sentence body)")
# Pattern: first N words are in title case, then a transition to normal
# sentence text. E.g. "Risk Management and Strategy We have..."
approach1_hits = []
for p in paragraphs:
    text = p["text"].strip()
    words = text.split()
    if len(words) < 6:
        continue
    # Find the transition point: count leading words that are title-case
    # (or allowed lowercase filler words); stop at the first lowercase word.
    title_words = 0
    for w in words:
        # Strip punctuation for checking
        clean = re.sub(r"[^a-zA-Z]", "", w)
        if not clean:
            title_words += 1
            continue
        # "and", "of", "the", "for", "in", "on" can be lowercase in titles
        if clean.lower() in {"and", "of", "the", "for", "in", "on", "a", "an", "or", "to", "by", "with"}:
            title_words += 1
            continue
        if clean[0].isupper():
            title_words += 1
        else:
            break
    # We want 3+ title-case words at the start, then a transition, and at
    # least a few words of sentence body remaining after the prefix.
    if 3 <= title_words < len(words) - 2:
        # Check that the word after the title block starts lowercase (sentence body)
        rest_start = words[title_words] if title_words < len(words) else ""
        rest_clean = re.sub(r"[^a-zA-Z]", "", rest_start)
        if rest_clean and rest_clean[0].islower():
            heading_part = " ".join(words[:title_words])
            # Skip if heading part begins with a common sentence starter
            if heading_part.lower().split()[0] not in COMMON_SENTENCE_STARTERS:
                approach1_hits.append({
                    "id": p["id"],
                    "heading_words": title_words,
                    "heading": heading_part,
                    "preview": preview(text),
                })

# Count heading patterns
heading_counter = Counter(h["heading"] for h in approach1_hits)
print(f"\nFound {len(approach1_hits):,} paragraphs with title-case prefix → lowercase body.")
print(f"Unique heading prefixes: {len(heading_counter):,}")
print(f"\nTOP 30 most common heading prefixes:")
for heading, count in heading_counter.most_common(30):
    # Find an example
    ex = next(h for h in approach1_hits if h["heading"] == heading)
    print(f" [{count:4d}x] \"{heading}\"")
    print(f" Example: {ex['preview']}")
print(f"\nSample of UNIQUE (1x) heading prefixes (first 30):")
# Walk the counter in insertion order, printing only one-off headings; stop
# as soon as 30 have been shown rather than materializing the full list.
shown = 0
for heading in heading_counter:
    if heading_counter[heading] != 1:
        continue
    ex = next(x for x in approach1_hits if x["heading"] == heading)
    print(f" \"{heading}\"")
    print(f"{ex['preview']}")
    shown += 1
    if shown == 30:
        break
# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 2: Capitalization anomalies
# ══════════════════════════════════════════════════════════════════════════════
print_section("APPROACH 2: Capitalization anomalies")
# 2a: ALL CAPS at start
allcaps_hits = []
for p in paragraphs:
    text = p["text"].strip()
    words = text.split()
    if len(words) < 4:
        continue
    # Count leading words that are entirely upper-case (single letters like
    # "A" are excluded so ordinary sentences don't trigger a match).
    caps_count = 0
    for w in words:
        clean = re.sub(r"[^a-zA-Z]", "", w)
        if not clean:
            caps_count += 1
            continue
        if clean.isupper() and len(clean) > 1:
            caps_count += 1
        else:
            break
    if caps_count >= 3:
        allcaps_hits.append({
            "id": p["id"],
            "caps_words": caps_count,
            "preview": preview(text),
        })
print(f"\n2a. ALL CAPS for first 3+ words: {len(allcaps_hits):,} paragraphs")
for h in allcaps_hits[:20]:
    print(f" [{h['caps_words']} caps words] {h['preview']}")

# 2b: First word is capitalized but NOT a common sentence starter
# and looks like a heading keyword
heading_start_hits = []
for p in paragraphs:
    text = p["text"].strip()
    words = text.split()
    if len(words) < 4:
        continue
    first_word = re.sub(r"[^a-zA-Z]", "", words[0]).lower()
    if first_word in HEADING_KEYWORDS and first_word not in COMMON_SENTENCE_STARTERS:
        heading_start_hits.append({
            "id": p["id"],
            "first_word": first_word,
            "preview": preview(text),
        })
heading_start_counter = Counter(h["first_word"] for h in heading_start_hits)
print(f"\n2b. First word is a heading keyword (not a sentence starter): {len(heading_start_hits):,} paragraphs")
print("Breakdown by keyword:")
for kw, count in heading_start_counter.most_common(30):
    ex = next(h for h in heading_start_hits if h["first_word"] == kw)
    # Keyword and example on separate lines (matches the other sections'
    # output style; the original printed them fused together).
    print(f" [{count:4d}x] \"{kw}\"")
    print(f" {ex['preview']}")
# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 3: Separator patterns
# ══════════════════════════════════════════════════════════════════════════════
print_section("APPROACH 3: Separator patterns (heading followed by separator then sentence)")
# Each pattern captures a plausible heading (group 1) followed by a separator
# and the start of a sentence body.
separator_patterns = {
    "period": re.compile(r"^([A-Z][A-Za-z\s,&]{3,60})\.\s+([A-Z][a-z])"),
    "dash/em-dash": re.compile(r"^([A-Z][A-Za-z\s,&]{3,60})\s*[–—-]\s*([A-Z][a-z])"),
    "semicolon": re.compile(r"^([A-Z][A-Za-z\s,&]{3,60});\s*([A-Z][a-z])"),
    "double space": re.compile(r"^([A-Z][A-Za-z\s,&]{3,60})\s{2,}([A-Z][a-z])"),
    "colon": re.compile(r"^([A-Z][A-Za-z\s,&]{3,60}):\s*([A-Z][a-z])"),
    "parenthetical prefix": re.compile(r"^\([a-z0-9ivx]+\)\s*([A-Z][A-Za-z\s,&]{3,60})\s+([a-z])"),
    "bullet/pipe prefix": re.compile(r"^[•●■▪◦‣|]\s*([A-Z][A-Za-z\s,&]{3,60})\s+([a-z])"),
    "tab separator": re.compile(r"^([A-Z][A-Za-z\s,&]{3,60})\t+(.+)"),
}
for sep_name, pattern in separator_patterns.items():
    hits = []
    for p in paragraphs:
        text = p["text"].strip()
        m = pattern.match(text)
        if m:
            heading_candidate = m.group(1).strip() if m.lastindex >= 1 else ""
            # Filter: heading should have at least 2 words
            if len(heading_candidate.split()) >= 2:
                hits.append({
                    "id": p["id"],
                    "heading": heading_candidate,
                    "preview": preview(text),
                })
    heading_counts = Counter(h["heading"] for h in hits)
    # " — " separator restored (the original line fused name and count).
    print(f"\n Separator: {sep_name} — {len(hits):,} hits")
    if hits:
        for heading, count in heading_counts.most_common(20):
            ex = next(h for h in hits if h["heading"] == heading)
            print(f" [{count:4d}x] \"{heading}\"")
            print(f" {ex['preview']}")
# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 4: Repeated first-3-words analysis
# ══════════════════════════════════════════════════════════════════════════════
print_section("APPROACH 4: Repeated first-3-word phrases")
first3_counter = Counter()
first3_examples = {}  # phrase -> preview of the first paragraph seen with it
for p in paragraphs:
    text = p["text"].strip()
    words = text.split()
    if len(words) < 4:
        continue
    first3 = " ".join(words[:3])
    first3_counter[first3] += 1
    if first3 not in first3_examples:
        first3_examples[first3] = preview(text)
# Filter to phrases appearing 5+ times that look heading-like
# (not common sentence starters)
common_starts = {
    "we have implemented", "we have established", "we have adopted",
    "we have not", "we do not", "we are not", "we believe that",
    "we use a", "we rely on", "we have a", "we also have",
    "our board of", "the board of", "the company has",
    "the audit committee", "in addition to", "as part of",
    "as a result", "in the event", "as of the",
    "in accordance with", "with respect to",
}
print(f"\nFirst-3-word phrases appearing 5+ times (excluding common sentence starts):")
for phrase, count in first3_counter.most_common(200):
    if count < 5:
        # most_common is sorted descending, so all remaining counts are < 5
        break
    if phrase.lower() in common_starts:
        continue
    # Heading-like: every alphabetic word is capitalized (or a lowercase
    # title filler word) and the phrase doesn't open like a sentence.
    words_lower = phrase.lower().split()
    is_heading_like = (
        all(
            w[0].isupper() or w in {"and", "of", "the", "for", "in", "on", "a", "or", "to"}
            for w in phrase.split()
            if re.sub(r"[^a-zA-Z]", "", w)
        )
        and words_lower[0] not in COMMON_SENTENCE_STARTERS
    )
    label = " [HEADING-LIKE]" if is_heading_like else ""
    print(f" [{count:4d}x] \"{phrase}\"{label}")
    print(f" {first3_examples[phrase]}")
# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 5: Cross-paragraph heading detection (short para → sentence para)
# ══════════════════════════════════════════════════════════════════════════════
print_section("APPROACH 5: Cross-paragraph heading detection (standalone short headings)")
# Group paragraphs by accession number, sorted by index, so consecutive
# paragraphs within the same filing can be compared.
by_filing = defaultdict(list)
for p in paragraphs:
    acc = p["filing"]["accessionNumber"]
    by_filing[acc].append(p)
for acc in by_filing:
    by_filing[acc].sort(key=lambda x: x["paragraphIndex"])
standalone_headings = []
for acc, pars in by_filing.items():
    # Walk adjacent (current, next) paragraph pairs.
    for curr, nxt in zip(pars, pars[1:]):
        curr_text = curr["text"].strip()
        curr_words = curr_text.split()
        nxt_text = nxt["text"].strip()
        # Current paragraph is short (2-10 words)
        if len(curr_words) > 10 or len(curr_words) < 2:
            continue
        # Current paragraph looks like a heading:
        # - Title case or all caps
        # - No period at end (headings rarely end with period)
        # - Not a single common word
        if curr_text.endswith(".") and not curr_text.endswith("etc."):
            continue
        # Check title-case-ish
        alpha_words = [w for w in curr_words if re.sub(r"[^a-zA-Z]", "", w)]
        if not alpha_words:
            continue
        title_case_ratio = sum(
            1 for w in alpha_words
            if re.sub(r"[^a-zA-Z]", "", w)[0].isupper()
            or re.sub(r"[^a-zA-Z]", "", w).lower() in {"and", "of", "the", "for", "in", "on", "a", "or", "to", "by", "with"}
        ) / len(alpha_words)
        if title_case_ratio < 0.8:
            continue
        # Next paragraph should be long enough to be a sentence body
        nxt_words = nxt_text.split()
        if len(nxt_words) < 3:
            continue
        standalone_headings.append({
            "id": curr["id"],
            "heading_text": curr_text,
            "next_preview": preview(nxt_text),
            "accession": acc,
            "company": curr["filing"]["companyName"],
        })
heading_text_counter = Counter(h["heading_text"] for h in standalone_headings)
print(f"\nFound {len(standalone_headings):,} potential standalone heading paragraphs.")
print(f"Unique heading texts: {len(heading_text_counter):,}")
print(f"\nTOP 30 most common standalone headings:")
for heading, count in heading_text_counter.most_common(30):
    ex = next(h for h in standalone_headings if h["heading_text"] == heading)
    print(f" [{count:4d}x] \"{heading}\"")
    print(f" Next para: {ex['next_preview']}")
print(f"\nSample of UNIQUE standalone headings (first 30):")
unique_standalone = [h for h in standalone_headings if heading_text_counter[h["heading_text"]] == 1]
for h in unique_standalone[:30]:
    print(f" \"{h['heading_text']}\" ({h['company']})")
    print(f" Next: {h['next_preview']}")
# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 6: Unusual word patterns at paragraph start
# ══════════════════════════════════════════════════════════════════════════════
print_section("APPROACH 6: Unusual starting words (gerunds, heading nouns)")
# 6a: Gerunds at start
gerund_hits = []
for p in paragraphs:
    text = p["text"].strip()
    words = text.split()
    if len(words) < 4:
        continue
    first_word = re.sub(r"[^a-zA-Z]", "", words[0]).lower()
    # len > 4 excludes short non-gerund "-ing" words like "king"/"ring".
    if first_word.endswith("ing") and len(first_word) > 4:
        if first_word in HEADING_GERUNDS or first_word not in COMMON_SENTENCE_STARTERS:
            gerund_hits.append({
                "id": p["id"],
                "first_word": first_word,
                "preview": preview(text),
            })
gerund_counter = Counter(h["first_word"] for h in gerund_hits)
print(f"\n6a. Paragraphs starting with gerunds: {len(gerund_hits):,}")
print("TOP 20 gerunds:")
for word, count in gerund_counter.most_common(20):
    ex = next(h for h in gerund_hits if h["first_word"] == word)
    # Word and example on separate lines (original fused them together).
    print(f" [{count:4d}x] \"{word}\"")
    print(f" {ex['preview']}")

# 6b: Heading nouns at start (already covered in 2b, but let's look at
# multi-word patterns starting with heading nouns)
noun_phrase_hits = []
for p in paragraphs:
    text = p["text"].strip()
    words = text.split()
    if len(words) < 4:
        continue
    first_word = re.sub(r"[^a-zA-Z]", "", words[0]).lower()
    if first_word in HEADING_KEYWORDS:
        # First 4 words form the candidate phrase (the guard above
        # guarantees at least 4 words, so no min() clamp is needed).
        first_few = " ".join(words[:4])
        noun_phrase_hits.append({
            "id": p["id"],
            "first_few": first_few,
            "preview": preview(text),
        })
noun_counter = Counter(h["first_few"] for h in noun_phrase_hits)
print(f"\n6b. Paragraphs starting with heading keyword nouns: {len(noun_phrase_hits):,}")
print("TOP 20 opening phrases:")
for phrase, count in noun_counter.most_common(20):
    ex = next(h for h in noun_phrase_hits if h["first_few"] == phrase)
    print(f" [{count:4d}x] \"{phrase}\"")
    print(f" {ex['preview']}")
# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 7: Numbers/letters at start (list items / numbered headings)
# ══════════════════════════════════════════════════════════════════════════════
print_section("APPROACH 7: Numbered/lettered items at paragraph start")
numbered_patterns = {
    "roman_paren": re.compile(r"^\((?:i{1,3}|iv|v|vi{0,3}|ix|x)\)\s"),
    "letter_paren": re.compile(r"^\([a-z]\)\s"),
    "number_paren": re.compile(r"^\(\d+\)\s"),
    "number_dot": re.compile(r"^\d+\.\s"),
    "letter_dot": re.compile(r"^[a-z]\.\s"),
    "roman_dot": re.compile(r"^(?:i{1,3}|iv|v|vi{0,3}|ix|x)\.\s"),
    "bullet_chars": re.compile(r"^[•●■▪◦‣►▸→·]\s"),
    "dash_bullet": re.compile(r"^[-–—]\s+[A-Z]"),
}
for pattern_name, pattern in numbered_patterns.items():
    hits = []
    for p in paragraphs:
        text = p["text"].strip()
        if pattern.match(text):
            hits.append({
                "id": p["id"],
                "preview": preview(text),
                "wordCount": p["wordCount"],
            })
    # " — " separator restored (the original line fused name and count).
    print(f"\n Pattern: {pattern_name} — {len(hits):,} hits")
    if hits:
        # Show word count distribution: short items are likely headings,
        # long ones are likely list-item body text.
        n_short = sum(1 for h in hits if h["wordCount"] < 15)
        n_medium = sum(1 for h in hits if 15 <= h["wordCount"] < 50)
        n_long = sum(1 for h in hits if h["wordCount"] >= 50)
        print(f" Length distribution: <15 words: {n_short}, 15-49: {n_medium}, 50+: {n_long}")
        print(f" Examples (first 10):")
        for h in hits[:10]:
            print(f" [{h['wordCount']:3d}w] {h['preview']}")
# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 8 (BONUS): Colon-separated inline headings deep dive
# ══════════════════════════════════════════════════════════════════════════════
print_section("APPROACH 8 (BONUS): Known heading phrases appearing ANYWHERE in first sentence")
# Check for known SEC 1C heading phrases appearing at the start of a paragraph
# even if not perfectly title-cased.
# NOTE(review): some phrases are prefixes of others ("risk management" vs
# "risk management and strategy"), so a paragraph can be counted under both.
known_heading_phrases = [
    "risk management", "risk assessment", "risk factors",
    "governance", "board oversight", "board of directors",
    "incident response", "third party", "third-party",
    "cybersecurity program", "cybersecurity risk", "cybersecurity governance",
    "information security", "data protection", "data privacy",
    "security operations", "security awareness",
    "management oversight", "committee oversight",
    "risk management and strategy", "risk management, strategy",
    "material cybersecurity", "materiality assessment",
    "disclosure controls",
]
phrase_hits = defaultdict(list)
for p in paragraphs:
    text = p["text"].strip()
    # Only look at the first ~80 chars
    first_part = text[:80].lower()
    for phrase in known_heading_phrases:
        if first_part.startswith(phrase):
            phrase_hits[phrase].append({
                "id": p["id"],
                "preview": preview(text),
            })
print(f"\nParagraphs starting with known heading phrases:")
for phrase in sorted(phrase_hits.keys(), key=lambda x: -len(phrase_hits[x])):
    hits = phrase_hits[phrase]
    # " — " separator restored (the original line fused phrase and count).
    print(f"\n \"{phrase}\" — {len(hits)} hits")
    for h in hits[:5]:
        print(f" {h['preview']}")
# ══════════════════════════════════════════════════════════════════════════════
# SUMMARY
# ══════════════════════════════════════════════════════════════════════════════
# Per-approach hit totals; approaches 3 and 7 report per-pattern counts in
# their own sections above, so they are not totalled here. Approach 4 prints
# phrase frequencies only and has no single hit count.
print_section("SUMMARY")
print(f"""
Approach 1 (title-case prefix → body): {len(approach1_hits):,} hits
Approach 2a (ALL CAPS start): {len(allcaps_hits):,} hits
Approach 2b (heading keyword start): {len(heading_start_hits):,} hits
Approach 3 (separator patterns): see above per-separator
Approach 5 (standalone short headings): {len(standalone_headings):,} hits
Approach 6a (gerund starts): {len(gerund_hits):,} hits
Approach 6b (heading noun starts): {len(noun_phrase_hits):,} hits
Approach 7 (numbered/lettered): see above per-pattern
Approach 8 (known phrase starts): {sum(len(v) for v in phrase_hits.values()):,} hits
""")
print("Done.")