"""
Heading candidate detection in SEC-cyBERT paragraph data.

Searches for inlined section headings that previous passes missed.

READ-ONLY: does not modify data, prints analysis to stdout.
"""
import json
import re
from collections import Counter, defaultdict
from pathlib import Path

# Cleaned JSONL corpus: one JSON paragraph record per line.
DATA_PATH = Path(__file__).resolve().parent.parent / "data" / "paragraphs" / "paragraphs-clean.jsonl"

# ── Load data ──────────────────────────────────────────────────────────────────

print(f"Loading data from {DATA_PATH} ...")
# Explicit encoding: the platform default (e.g. cp1252 on Windows) could
# otherwise corrupt non-ASCII characters in filing text.
with open(DATA_PATH, encoding="utf-8") as f:
    paragraphs = [json.loads(line) for line in f]
print(f"Loaded {len(paragraphs):,} paragraphs.\n")
# ── Helpers ────────────────────────────────────────────────────────────────────

def preview(text: str, n: int = 150) -> str:
    """Return a single-line preview: the first *n* characters of *text*,
    with newlines flattened to spaces and surrounding whitespace removed."""
    snippet = text[:n]
    return snippet.replace("\n", " ").strip()
# Lowercase words that typically open an ordinary sentence rather than a heading.
COMMON_SENTENCE_STARTERS = set(
    """
    we our the a an as in on to for if
    this these that those it its such no not
    with from at by or and all any each
    while when where although because since after
    before during under over between through into
    upon about there here however additionally
    furthermore moreover also finally similarly
    accordingly consequently therefore thus nonetheless
    notwithstanding specifically generally currently
    recently historically collectively certain
    """.split()
)

# Nouns that frequently appear in SEC Item 1C section headings.
HEADING_KEYWORDS = set(
    """
    oversight framework assessment compliance integration
    governance strategy management disclosure reporting
    response recovery prevention detection monitoring
    awareness training policy policies procedures
    controls cybersecurity information security risk
    board committee audit technology infrastructure
    incident incidents threat threats vulnerability
    program processes overview background introduction
    summary conclusion material materiality
    """.split()
)

# Gerunds that often begin heading-style phrases ("Managing Cybersecurity Risk").
HEADING_GERUNDS = set(
    """
    protecting monitoring assessing managing overseeing
    implementing establishing maintaining identifying
    evaluating mitigating addressing enhancing ensuring
    integrating reporting disclosing detecting preventing
    responding recovering training educating reviewing
    governing supervising coordinating leveraging
    strengthening safeguarding securing
    """.split()
)
SEPARATOR_LINE = "=" * 100


def print_section(title: str) -> None:
    """Print *title* framed above and below by a 100-char separator rule."""
    print(f"\n{SEPARATOR_LINE}")
    print(f"  {title}")
    print(SEPARATOR_LINE)
# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 1: First-sentence grammatical analysis
# ══════════════════════════════════════════════════════════════════════════════

print_section("APPROACH 1: First-clause looks like a heading (title case prefix → sentence body)")

# Pattern: first N words are in title case, then a transition to normal
# sentence text. E.g. "Risk Management and Strategy We have..."

# Connective words that may legitimately stay lowercase inside a title-case heading.
_TITLE_LOWER_OK = {"and", "of", "the", "for", "in", "on", "a", "an", "or", "to", "by", "with"}


def _title_prefix_len(words):
    """Return how many leading words look like part of a title-case heading.

    A word qualifies if it contains no letters (pure punctuation/digits),
    is a lowercase connective, or starts with an uppercase letter.
    """
    n = 0
    for w in words:
        clean = re.sub(r"[^a-zA-Z]", "", w)
        if not clean or clean.lower() in _TITLE_LOWER_OK or clean[0].isupper():
            n += 1
        else:
            break
    return n


approach1_hits = []
# First hit per heading, so the example lookups when printing are O(1)
# instead of a linear scan over all hits per printed row.
_first_example = {}

for p in paragraphs:
    text = p["text"].strip()
    words = text.split()
    if len(words) < 6:
        continue

    title_words = _title_prefix_len(words)

    # We want 3+ title-case words at the start, then at least a few words of body.
    if not (3 <= title_words < len(words) - 2):
        continue

    # The word right after the title block must start lowercase (sentence body).
    rest_clean = re.sub(r"[^a-zA-Z]", "", words[title_words])
    if not rest_clean or not rest_clean[0].islower():
        continue

    heading_part = " ".join(words[:title_words])
    # Skip if the heading part opens like an ordinary sentence.
    if heading_part.lower().split()[0] in COMMON_SENTENCE_STARTERS:
        continue

    hit = {
        "id": p["id"],
        "heading_words": title_words,
        "heading": heading_part,
        "preview": preview(text),
    }
    approach1_hits.append(hit)
    _first_example.setdefault(heading_part, hit)

# Count heading patterns
heading_counter = Counter(h["heading"] for h in approach1_hits)
print(f"\nFound {len(approach1_hits):,} paragraphs with title-case prefix → lowercase body.")
print(f"Unique heading prefixes: {len(heading_counter):,}")
print(f"\nTOP 30 most common heading prefixes:")
for heading, count in heading_counter.most_common(30):
    ex = _first_example[heading]
    print(f"  [{count:4d}x] \"{heading}\"")
    print(f"         Example: {ex['preview']}")

print(f"\nSample of UNIQUE (1x) heading prefixes (first 30):")
# Counter preserves first-occurrence insertion order, so this reproduces the
# original ordering without the previous nested-generator re-scan of all hits.
unique_headings = [(h, _first_example[h]) for h, c in heading_counter.items() if c == 1]
for heading, ex in unique_headings[:30]:
    print(f"  \"{heading}\"")
    print(f"      → {ex['preview']}")
# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 2: Capitalization anomalies
# ══════════════════════════════════════════════════════════════════════════════

print_section("APPROACH 2: Capitalization anomalies")

# 2a: a run of ALL-CAPS words at the start of the paragraph
allcaps_hits = []
for p in paragraphs:
    stripped = p["text"].strip()
    tokens = stripped.split()
    if len(tokens) < 4:
        continue
    # Length of the leading run of tokens that are fully upper-case
    # (multi-letter) or contain no letters at all.
    run = 0
    for tok in tokens:
        letters = re.sub(r"[^a-zA-Z]", "", tok)
        if letters and not (letters.isupper() and len(letters) > 1):
            break
        run += 1
    if run >= 3:
        allcaps_hits.append(
            {"id": p["id"], "caps_words": run, "preview": preview(stripped)}
        )

print(f"\n2a. ALL CAPS for first 3+ words: {len(allcaps_hits):,} paragraphs")
for h in allcaps_hits[:20]:
    print(f"  [{h['caps_words']} caps words] {h['preview']}")

# 2b: first word is a heading keyword rather than a common sentence starter
heading_start_hits = []
for p in paragraphs:
    stripped = p["text"].strip()
    tokens = stripped.split()
    if len(tokens) < 4:
        continue
    lead = re.sub(r"[^a-zA-Z]", "", tokens[0]).lower()
    if lead in HEADING_KEYWORDS and lead not in COMMON_SENTENCE_STARTERS:
        heading_start_hits.append(
            {"id": p["id"], "first_word": lead, "preview": preview(stripped)}
        )

heading_start_counter = Counter(h["first_word"] for h in heading_start_hits)
print(f"\n2b. First word is a heading keyword (not a sentence starter): {len(heading_start_hits):,} paragraphs")
print("Breakdown by keyword:")
for kw, count in heading_start_counter.most_common(30):
    ex = next(h for h in heading_start_hits if h["first_word"] == kw)
    print(f"  [{count:4d}x] \"{kw}\" — {ex['preview']}")
# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 3: Separator patterns
# ══════════════════════════════════════════════════════════════════════════════

print_section("APPROACH 3: Separator patterns (heading followed by separator then sentence)")

# Each regex captures a candidate heading (group 1) followed by some separator
# and the start of a sentence body.
separator_patterns = {
    "period": re.compile(r"^([A-Z][A-Za-z\s,&]{3,60})\.\s+([A-Z][a-z])"),
    "dash/em-dash": re.compile(r"^([A-Z][A-Za-z\s,&]{3,60})\s*[–—-]\s*([A-Z][a-z])"),
    "semicolon": re.compile(r"^([A-Z][A-Za-z\s,&]{3,60});\s*([A-Z][a-z])"),
    "double space": re.compile(r"^([A-Z][A-Za-z\s,&]{3,60})\s{2,}([A-Z][a-z])"),
    "colon": re.compile(r"^([A-Z][A-Za-z\s,&]{3,60}):\s*([A-Z][a-z])"),
    "parenthetical prefix": re.compile(r"^\([a-z0-9ivx]+\)\s*([A-Z][A-Za-z\s,&]{3,60})\s+([a-z])"),
    "bullet/pipe prefix": re.compile(r"^[•●■▪◦‣|]\s*([A-Z][A-Za-z\s,&]{3,60})\s+([a-z])"),
    "tab separator": re.compile(r"^([A-Z][A-Za-z\s,&]{3,60})\t+(.+)"),
}

for sep_name, pattern in separator_patterns.items():
    matched = []
    for p in paragraphs:
        stripped = p["text"].strip()
        found = pattern.match(stripped)
        if not found:
            continue
        candidate = found.group(1).strip() if found.lastindex >= 1 else ""
        # Keep only multi-word heading candidates.
        if len(candidate.split()) < 2:
            continue
        matched.append(
            {"id": p["id"], "heading": candidate, "preview": preview(stripped)}
        )

    heading_counts = Counter(h["heading"] for h in matched)
    print(f"\n  Separator: {sep_name} — {len(matched):,} hits")
    if matched:
        for heading, count in heading_counts.most_common(20):
            ex = next(h for h in matched if h["heading"] == heading)
            print(f"    [{count:4d}x] \"{heading}\"")
            print(f"        {ex['preview']}")
# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 4: Repeated first-3-words analysis
# ══════════════════════════════════════════════════════════════════════════════

print_section("APPROACH 4: Repeated first-3-word phrases")

first3_counter = Counter()
first3_examples = {}

for p in paragraphs:
    stripped = p["text"].strip()
    tokens = stripped.split()
    if len(tokens) < 4:
        continue
    opener = " ".join(tokens[:3])
    first3_counter[opener] += 1
    first3_examples.setdefault(opener, preview(stripped))

# Frequent but uninteresting sentence openers to exclude from the report.
common_starts = {
    "we have implemented", "we have established", "we have adopted",
    "we have not", "we do not", "we are not", "we believe that",
    "we use a", "we rely on", "we have a", "we also have",
    "our board of", "the board of", "the company has",
    "the audit committee", "in addition to", "as part of",
    "as a result", "in the event", "as of the",
    "in accordance with", "with respect to",
}

# Connectives that may stay lowercase in a title-case phrase.
_CONNECTIVES = {"and", "of", "the", "for", "in", "on", "a", "or", "to"}

print(f"\nFirst-3-word phrases appearing 5+ times (excluding common sentence starts):")
for phrase, count in first3_counter.most_common(200):
    if count < 5:
        break
    if phrase.lower() in common_starts:
        continue
    # Heading-like: every token containing letters is capitalized (or a
    # connective) and the phrase does not open like an ordinary sentence.
    words_lower = phrase.lower().split()
    alpha_tokens = [w for w in phrase.split() if re.sub(r"[^a-zA-Z]", "", w)]
    is_heading_like = (
        all(w[0].isupper() or w in _CONNECTIVES for w in alpha_tokens)
        and words_lower[0] not in COMMON_SENTENCE_STARTERS
    )
    label = "  [HEADING-LIKE]" if is_heading_like else ""
    print(f"  [{count:4d}x] \"{phrase}\"{label}")
    print(f"        {first3_examples[phrase]}")
# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 5: Cross-paragraph heading detection (short para → sentence para)
# ══════════════════════════════════════════════════════════════════════════════

print_section("APPROACH 5: Cross-paragraph heading detection (standalone short headings)")

# Group paragraphs by accession number, ordered by paragraph index.
by_filing = defaultdict(list)
for p in paragraphs:
    by_filing[p["filing"]["accessionNumber"]].append(p)

for acc in by_filing:
    by_filing[acc].sort(key=lambda x: x["paragraphIndex"])

# Connectives that may stay lowercase inside a title-case heading.
_LOWER_OK_IN_TITLE = {"and", "of", "the", "for", "in", "on", "a", "or", "to", "by", "with"}

standalone_headings = []
for acc, pars in by_filing.items():
    # Walk consecutive (candidate-heading, following-paragraph) pairs.
    for curr, nxt in zip(pars, pars[1:]):
        curr_text = curr["text"].strip()
        curr_words = curr_text.split()
        nxt_text = nxt["text"].strip()

        # Candidate headings are short: 2-10 words.
        if not (2 <= len(curr_words) <= 10):
            continue

        # Headings rarely end with a period ("etc." is tolerated).
        if curr_text.endswith(".") and not curr_text.endswith("etc."):
            continue

        alpha_words = [w for w in curr_words if re.sub(r"[^a-zA-Z]", "", w)]
        if not alpha_words:
            continue

        # Share of words that are capitalized or allowed-lowercase connectives
        # must be at least 80% for the paragraph to read as title case.
        titleish = 0
        for w in alpha_words:
            letters = re.sub(r"[^a-zA-Z]", "", w)
            if letters[0].isupper() or letters.lower() in _LOWER_OK_IN_TITLE:
                titleish += 1
        if titleish / len(alpha_words) < 0.8:
            continue

        # The following paragraph must carry at least a few words of body text.
        if len(nxt_text.split()) < 3:
            continue

        standalone_headings.append({
            "id": curr["id"],
            "heading_text": curr_text,
            "next_preview": preview(nxt_text),
            "accession": acc,
            "company": curr["filing"]["companyName"],
        })

heading_text_counter = Counter(h["heading_text"] for h in standalone_headings)
print(f"\nFound {len(standalone_headings):,} potential standalone heading paragraphs.")
print(f"Unique heading texts: {len(heading_text_counter):,}")
print(f"\nTOP 30 most common standalone headings:")
for heading, count in heading_text_counter.most_common(30):
    ex = next(h for h in standalone_headings if h["heading_text"] == heading)
    print(f"  [{count:4d}x] \"{heading}\"")
    print(f"         Next para: {ex['next_preview']}")

print(f"\nSample of UNIQUE standalone headings (first 30):")
unique_standalone = [h for h in standalone_headings if heading_text_counter[h["heading_text"]] == 1]
for h in unique_standalone[:30]:
    print(f"  \"{h['heading_text']}\" ({h['company']})")
    print(f"      Next: {h['next_preview']}")
# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 6: Unusual word patterns at paragraph start
# ══════════════════════════════════════════════════════════════════════════════

print_section("APPROACH 6: Unusual starting words (gerunds, heading nouns)")

# 6a: paragraphs opening with a gerund
gerund_hits = []
for p in paragraphs:
    stripped = p["text"].strip()
    tokens = stripped.split()
    if len(tokens) < 4:
        continue
    lead = re.sub(r"[^a-zA-Z]", "", tokens[0]).lower()
    if not (lead.endswith("ing") and len(lead) > 4):
        continue
    if lead in HEADING_GERUNDS or lead not in COMMON_SENTENCE_STARTERS:
        gerund_hits.append(
            {"id": p["id"], "first_word": lead, "preview": preview(stripped)}
        )

gerund_counter = Counter(h["first_word"] for h in gerund_hits)
print(f"\n6a. Paragraphs starting with gerunds: {len(gerund_hits):,}")
print("TOP 20 gerunds:")
for word, count in gerund_counter.most_common(20):
    ex = next(h for h in gerund_hits if h["first_word"] == word)
    print(f"  [{count:4d}x] \"{word}\" — {ex['preview']}")

# 6b: multi-word opening phrases that begin with a heading keyword noun
# (overlaps 2b, but surfaces the surrounding phrase instead of just the word)
noun_phrase_hits = []
for p in paragraphs:
    stripped = p["text"].strip()
    tokens = stripped.split()
    if len(tokens) < 4:
        continue
    lead = re.sub(r"[^a-zA-Z]", "", tokens[0]).lower()
    if lead in HEADING_KEYWORDS:
        # len(tokens) >= 4 here, so the slice always yields four words.
        noun_phrase_hits.append({
            "id": p["id"],
            "first_few": " ".join(tokens[:4]),
            "preview": preview(stripped),
        })

noun_counter = Counter(h["first_few"] for h in noun_phrase_hits)
print(f"\n6b. Paragraphs starting with heading keyword nouns: {len(noun_phrase_hits):,}")
print("TOP 20 opening phrases:")
for phrase, count in noun_counter.most_common(20):
    ex = next(h for h in noun_phrase_hits if h["first_few"] == phrase)
    print(f"  [{count:4d}x] \"{phrase}\" — {ex['preview']}")
# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 7: Numbers/letters at start (list items / numbered headings)
# ══════════════════════════════════════════════════════════════════════════════

print_section("APPROACH 7: Numbered/lettered items at paragraph start")

# Markers that typically introduce a list item or a numbered heading.
numbered_patterns = {
    "roman_paren": re.compile(r"^\((?:i{1,3}|iv|v|vi{0,3}|ix|x)\)\s"),
    "letter_paren": re.compile(r"^\([a-z]\)\s"),
    "number_paren": re.compile(r"^\(\d+\)\s"),
    "number_dot": re.compile(r"^\d+\.\s"),
    "letter_dot": re.compile(r"^[a-z]\.\s"),
    "roman_dot": re.compile(r"^(?:i{1,3}|iv|v|vi{0,3}|ix|x)\.\s"),
    "bullet_chars": re.compile(r"^[•●■▪◦‣►▸→·]\s"),
    "dash_bullet": re.compile(r"^[-–—]\s+[A-Z]"),
}

for pattern_name, pattern in numbered_patterns.items():
    matched = []
    for p in paragraphs:
        stripped = p["text"].strip()
        if pattern.match(stripped):
            matched.append({
                "id": p["id"],
                "preview": preview(stripped),
                "wordCount": p["wordCount"],
            })

    print(f"\n  Pattern: {pattern_name} — {len(matched):,} hits")
    if matched:
        # Word-count distribution of the matching paragraphs.
        n_short = sum(1 for h in matched if h["wordCount"] < 15)
        n_medium = sum(1 for h in matched if 15 <= h["wordCount"] < 50)
        n_long = sum(1 for h in matched if h["wordCount"] >= 50)
        print(f"    Length distribution: <15 words: {n_short}, 15-49: {n_medium}, 50+: {n_long}")
        print(f"    Examples (first 10):")
        for h in matched[:10]:
            print(f"      [{h['wordCount']:3d}w] {h['preview']}")
# ══════════════════════════════════════════════════════════════════════════════
# APPROACH 8 (BONUS): Known heading phrases at paragraph start
# ══════════════════════════════════════════════════════════════════════════════

# NOTE: matching below is prefix-only (str.startswith), so the old banner
# claiming "appearing ANYWHERE in first sentence" overstated the scope; the
# banner now describes what is actually computed.
print_section("APPROACH 8 (BONUS): Known heading phrases at the start of a paragraph")

# Known SEC Item 1C heading phrases, matched case-insensitively at the start
# of a paragraph even when not perfectly title-cased.
known_heading_phrases = [
    "risk management", "risk assessment", "risk factors",
    "governance", "board oversight", "board of directors",
    "incident response", "third party", "third-party",
    "cybersecurity program", "cybersecurity risk", "cybersecurity governance",
    "information security", "data protection", "data privacy",
    "security operations", "security awareness",
    "management oversight", "committee oversight",
    "risk management and strategy", "risk management, strategy",
    "material cybersecurity", "materiality assessment",
    "disclosure controls",
]

phrase_hits = defaultdict(list)
for p in paragraphs:
    text = p["text"].strip()
    lowered = text.lower()
    # A paragraph matching several phrases (e.g. "risk management" and
    # "risk management and strategy") is counted under each of them.
    for phrase in known_heading_phrases:
        # startswith only inspects len(phrase) characters, so the previous
        # text[:80] slice was dead weight and has been removed.
        if lowered.startswith(phrase):
            phrase_hits[phrase].append({"id": p["id"], "preview": preview(text)})

print(f"\nParagraphs starting with known heading phrases:")
# Report phrases in descending order of hit count.
for phrase in sorted(phrase_hits.keys(), key=lambda ph: -len(phrase_hits[ph])):
    hits = phrase_hits[phrase]
    print(f"\n  \"{phrase}\" — {len(hits)} hits")
    for h in hits[:5]:
        print(f"    {h['preview']}")
# ══════════════════════════════════════════════════════════════════════════════
# SUMMARY
# ══════════════════════════════════════════════════════════════════════════════

print_section("SUMMARY")
summary = f"""
Approach 1 (title-case prefix → body): {len(approach1_hits):,} hits
Approach 2a (ALL CAPS start): {len(allcaps_hits):,} hits
Approach 2b (heading keyword start): {len(heading_start_hits):,} hits
Approach 3 (separator patterns): see above per-separator
Approach 5 (standalone short headings): {len(standalone_headings):,} hits
Approach 6a (gerund starts): {len(gerund_hits):,} hits
Approach 6b (heading noun starts): {len(noun_phrase_hits):,} hits
Approach 7 (numbered/lettered): see above per-pattern
Approach 8 (known phrase starts): {sum(len(v) for v in phrase_hits.values()):,} hits
"""
print(summary)

print("Done.")
|