#!/usr/bin/env python3
"""
Novel data quality audit for paragraphs-clean.jsonl.

READ-ONLY: prints findings to stdout, does not modify any files.
"""
|
||
|
||
import json
|
||
import re
|
||
import sys
|
||
from collections import Counter, defaultdict
|
||
from pathlib import Path
|
||
|
||
# Path to the cleaned paragraph dataset (JSONL, one JSON object per line),
# resolved relative to this script's own location.
DATA_PATH = Path(__file__).resolve().parent.parent / "data" / "paragraphs" / "paragraphs-clean.jsonl"

# ── Cybersecurity domain keywords (broad) ──────────────────────────────
# NOTE(review): has_cyber_relevance() matches these as raw substrings, so
# short entries like "soc" or "material" also hit unrelated words
# ("associated", "materially") — confirm that this breadth is intentional.
CYBER_KEYWORDS = {
    "cyber", "cybersecurity", "security", "breach", "incident", "threat",
    "vulnerability", "malware", "ransomware", "phishing", "firewall",
    "encryption", "intrusion", "unauthorized", "attack", "hacker",
    "data protection", "information security", "network security",
    "access control", "authentication", "risk management", "ciso",
    "chief information security", "chief information officer",
    "information technology", "it systems", "data privacy", "privacy",
    "personally identifiable", "pii", "soc", "nist", "iso 27001",
    "penetration test", "disaster recovery", "business continuity",
    "third party", "vendor", "supply chain", "cloud", "endpoint",
    "monitoring", "detection", "response", "remediation", "patch",
    "compliance", "regulatory", "safeguard", "protect", "secure",
    "confidential", "integrity", "availability", "resilience",
    "governance", "oversight", "board of directors", "audit committee",
    "risk factor", "material", "disclosure", "1c", "item 1c",
}

# ── Non-cyber legal boilerplate patterns ────────────────────────────────
# Phrases that indicate generic securities-law boilerplate rather than
# cybersecurity disclosure content.
BOILERPLATE_PATTERNS = [
    re.compile(r"forward[- ]looking\s+statements?", re.I),
    re.compile(r"safe\s+harbor", re.I),
    re.compile(r"private\s+securities\s+litigation\s+reform\s+act", re.I),
    re.compile(r"cautionary\s+statement", re.I),
    re.compile(r"except\s+as\s+required\s+by\s+law.*no\s+obligation\s+to\s+update", re.I),
    re.compile(r"this\s+(annual\s+)?report\s+(on\s+form\s+10-k\s+)?contains?\s+forward", re.I),
]

# ── SEC item cross-reference pattern ────────────────────────────────────
# Captures the item designator (e.g. "1A", "7") from phrases like "Item 1A".
SEC_ITEM_RE = re.compile(r"\bItem\s+(\d+[A-Z]?)\b", re.I)

# ── Dollar amount pattern ──────────────────────────────────────────────
# NOTE(review): "[\d,]+" also accepts a bare "$," — harmless for auditing,
# but tighten to r"\$\d[\d,]*" if false positives appear.
DOLLAR_RE = re.compile(r"\$[\d,]+(?:\.\d+)?\s*(?:thousand|million|billion|trillion)?", re.I)

# ── Date patterns (unusual formats) ────────────────────────────────────
DATE_PATTERNS = [
    # MM/DD/YYYY or MM-DD-YYYY
    re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b"),
    # Month DD, YYYY
    re.compile(r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b", re.I),
    # DD Month YYYY
    re.compile(r"\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b", re.I),
    # YYYY-MM-DD (ISO)
    re.compile(r"\b\d{4}-\d{2}-\d{2}\b"),
]

# ── Bullet point characters ────────────────────────────────────────────
# Common Unicode bullet/list-marker glyphs left over from HTML extraction.
BULLET_RE = re.compile(r"[\u2022\u2023\u25E6\u2043\u2219\u25AA\u25AB\u25CF\u25CB\u25A0\u25A1]")
|
||
|
||
# ── Helpers ─────────────────────────────────────────────────────────────
|
||
def truncate(text: str, max_len: int = 200) -> str:
    """Return *text* unchanged if it fits in *max_len* chars, else a
    cut-off copy with a "..." marker appended."""
    needs_cut = len(text) > max_len
    return text[:max_len] + "..." if needs_cut else text
|
||
|
||
|
||
def print_section(title: str):
    """Print *title* framed above and below by 80-character '=' rules."""
    rule = "=" * 80
    print("\n" + rule)
    print(f" {title}")
    print(rule)
|
||
|
||
|
||
def print_finding(name: str, concern: str, count: int, total: int, examples: list[dict], *, max_examples: int = 5):
    """Print a single audit finding: header, count/rate, and sample examples.

    Args:
        name: Human-readable finding title.
        concern: Severity label, e.g. "HIGH" / "MEDIUM" / "LOW".
        count: Number of paragraphs matching the finding.
        total: Total paragraphs audited (zero-safe: rate prints as 0%).
        examples: Matching paragraph dicts; each needs a "text" key and may
            carry a "filing" dict with "companyName".
        max_examples: How many examples to print before summarizing the rest
            (was a hard-coded 5; keyword-only so existing calls are unchanged).
    """
    pct = count / total * 100 if total else 0
    print(f"\n--- {name} [{concern} CONCERN] ---")
    print(f" Count: {count:,} / {total:,} ({pct:.2f}%)")
    for i, ex in enumerate(examples[:max_examples]):
        filing = ex.get("filing", {})
        company = filing.get("companyName", "?")
        print(f" Example {i+1} [{company}]:")
        print(f" {truncate(ex['text'], 300)}")
    if count > max_examples:
        print(f" ... and {count - max_examples:,} more")
|
||
|
||
|
||
def has_cyber_relevance(text_lower: str, keywords=None) -> bool:
    """Return True if *text_lower* contains any cybersecurity keyword.

    Matching is plain substring containment, so a keyword embedded inside a
    longer word also counts (e.g. "soc" matches "associated").

    Args:
        text_lower: Paragraph text, already lowercased by the caller.
        keywords: Optional iterable of keywords to match; defaults to the
            module-level CYBER_KEYWORDS set (resolved at call time, so the
            original behavior is unchanged).
    """
    if keywords is None:
        keywords = CYBER_KEYWORDS
    return any(kw in text_lower for kw in keywords)
|
||
|
||
|
||
# ── Load data ──────────────────────────────────────────────────────────
|
||
def load_data(path=None):
    """Load paragraph records from a JSONL file (one JSON object per line).

    Args:
        path: Optional path to the JSONL file; defaults to the module-level
            DATA_PATH (resolved at call time, preserving original behavior).

    Returns:
        list[dict]: One parsed object per non-blank line.

    Fixes vs. original: opens the file with an explicit UTF-8 encoding
    (the default is platform-dependent) and skips blank lines, which would
    otherwise crash json.loads().
    """
    if path is None:
        path = DATA_PATH
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
|
||
|
||
|
||
def main():
    """Run every audit check over the paragraph dataset and print a report.

    READ-ONLY: all findings go to stdout; no files are modified. Sections:
    character-level anomalies, content anomalies, structural anomalies,
    outlier detection, semantic coherence, bonus checks, and a summary.
    """
    print("Loading data...")
    paragraphs = load_data()
    total = len(paragraphs)
    print(f"Loaded {total:,} paragraphs.\n")

    # Pre-compute lowercase texts (reused by the keyword check in section 5b)
    texts_lower = [p["text"].lower() for p in paragraphs]

    # ════════════════════════════════════════════════════════════════════
    print_section("1. CHARACTER-LEVEL ANOMALIES")
    # ════════════════════════════════════════════════════════════════════

    # 1a. High uppercase ratio (>30%)
    high_upper = []
    for p in paragraphs:
        t = p["text"]
        alpha = sum(1 for c in t if c.isalpha())
        if alpha < 10:
            continue  # too few letters for the ratio to be meaningful
        upper = sum(1 for c in t if c.isupper())
        ratio = upper / alpha
        if ratio > 0.30:
            high_upper.append({**p, "_ratio": ratio})
    high_upper.sort(key=lambda x: x["_ratio"], reverse=True)
    print_finding("High uppercase ratio (>30% of alpha chars)", "MEDIUM",
                  len(high_upper), total, high_upper)

    # 1b. Unusual punctuation density
    high_punct = []
    for p in paragraphs:
        t = p["text"]
        if len(t) < 30:
            continue  # skip very short texts where density is noisy
        semis = t.count(";")
        colons = t.count(":")
        dashes = t.count("—") + t.count("–") + t.count("-")
        punct_count = semis + colons + dashes
        density = punct_count / len(t)
        if density > 0.05:
            high_punct.append({**p, "_density": density, "_semis": semis, "_colons": colons, "_dashes": dashes})
    high_punct.sort(key=lambda x: x["_density"], reverse=True)
    print_finding("High punctuation density (semicolons/colons/dashes >5% of chars)", "LOW",
                  len(high_punct), total, high_punct)

    # 1c. Non-ASCII characters
    non_ascii_paras = []
    non_ascii_chars_all = Counter()
    for p in paragraphs:
        t = p["text"]
        non_ascii = [(c, hex(ord(c)), ord(c)) for c in t if ord(c) > 127]
        if non_ascii:
            chars_found = set((c, h) for c, h, _ in non_ascii)
            for c, h, _ in non_ascii:
                non_ascii_chars_all[f"{c} ({h})"] += 1
            non_ascii_paras.append({**p, "_chars": chars_found})
    print_finding("Paragraphs with non-ASCII characters", "MEDIUM",
                  len(non_ascii_paras), total, non_ascii_paras)
    if non_ascii_chars_all:
        print("\n Non-ASCII character frequency:")
        for char_repr, cnt in non_ascii_chars_all.most_common(20):
            print(f" {char_repr}: {cnt:,} occurrences")

    # 1d. Unusual whitespace (multiple spaces, tabs)
    # NOTE(review): as written, r" +" matches ANY run of spaces including a
    # single one, so every text with a space is flagged — this was likely
    # meant to be r"  +" (two-or-more). Confirm against the original intent.
    multi_space_re = re.compile(r" +")
    tab_re = re.compile(r"\t")
    whitespace_issues = []
    for p in paragraphs:
        t = p["text"]
        multi = len(multi_space_re.findall(t))
        tabs = len(tab_re.findall(t))
        if multi > 0 or tabs > 0:
            whitespace_issues.append({**p, "_multi_spaces": multi, "_tabs": tabs})
    print_finding("Unusual whitespace (multiple spaces or tabs)", "MEDIUM",
                  len(whitespace_issues), total, whitespace_issues)

    # ════════════════════════════════════════════════════════════════════
    print_section("2. CONTENT ANOMALIES")
    # ════════════════════════════════════════════════════════════════════

    # 2a. Dollar amounts
    dollar_paras = []
    for p in paragraphs:
        matches = DOLLAR_RE.findall(p["text"])
        if matches:
            dollar_paras.append({**p, "_amounts": matches})
    print_finding("Paragraphs with dollar amounts", "MEDIUM",
                  len(dollar_paras), total, dollar_paras)
    if dollar_paras:
        # Show distribution of dollar amounts
        all_amounts = []
        for dp in dollar_paras:
            all_amounts.extend(dp["_amounts"])
        print(f"\n Total dollar amount mentions: {len(all_amounts):,}")
        amount_counter = Counter(all_amounts)
        print(" Most common amounts:")
        for amt, cnt in amount_counter.most_common(10):
            print(f" {amt}: {cnt:,}")

    # 2b. Dates in text
    date_paras = []
    for p in paragraphs:
        t = p["text"]
        found_dates = []
        for pat in DATE_PATTERNS:
            found_dates.extend(pat.findall(t))
        if found_dates:
            date_paras.append({**p, "_dates": found_dates})
    print_finding("Paragraphs containing dates", "LOW",
                  len(date_paras), total, date_paras)
    if date_paras:
        all_dates = []
        for dp in date_paras:
            all_dates.extend(dp["_dates"])
        print(f"\n Total date mentions: {len(all_dates):,}")

    # 2c. Cross-references to other SEC items
    cross_ref_paras = []
    for p in paragraphs:
        matches = SEC_ITEM_RE.findall(p["text"])
        # Filter out Item 1C (that's expected)
        other_items = [m for m in matches if m.upper() != "1C"]
        if other_items:
            cross_ref_paras.append({**p, "_items": other_items})
    # Count which items are referenced
    item_counts = Counter()
    for crp in cross_ref_paras:
        for item in crp["_items"]:
            item_counts[f"Item {item}"] += 1
    print_finding("Cross-references to non-1C SEC items", "HIGH",
                  len(cross_ref_paras), total, cross_ref_paras)
    if item_counts:
        print("\n Referenced items:")
        for item, cnt in item_counts.most_common():
            print(f" {item}: {cnt:,}")

    # 2d. Non-cyber legal boilerplate
    boilerplate_paras = []
    for p in paragraphs:
        t = p["text"]
        matched = []
        for pat in BOILERPLATE_PATTERNS:
            if pat.search(t):
                # Record a truncated pattern string so output stays readable
                matched.append(pat.pattern[:60])
        if matched:
            boilerplate_paras.append({**p, "_patterns": matched})
    print_finding("Non-cybersecurity legal boilerplate", "HIGH",
                  len(boilerplate_paras), total, boilerplate_paras)

    # ════════════════════════════════════════════════════════════════════
    print_section("3. STRUCTURAL ANOMALIES")
    # ════════════════════════════════════════════════════════════════════

    # 3a. Bullet points mid-text (Unicode bullet glyphs, or dash/star list
    # markers at a line start)
    bullet_paras = []
    for p in paragraphs:
        t = p["text"]
        if BULLET_RE.search(t):
            bullet_paras.append(p)
        elif re.search(r"(?:^|\n)\s*[-*]\s+\w", t):
            bullet_paras.append(p)
    print_finding("Paragraphs with bullet points mid-text", "MEDIUM",
                  len(bullet_paras), total, bullet_paras)

    # 3b. Embedded newlines
    newline_paras = []
    for p in paragraphs:
        t = p["text"]
        nl_count = t.count("\n")
        if nl_count > 0:
            newline_paras.append({**p, "_newlines": nl_count})
    newline_paras.sort(key=lambda x: x["_newlines"], reverse=True)
    print_finding("Paragraphs with embedded newlines", "MEDIUM",
                  len(newline_paras), total, newline_paras)

    # 3c. Mid-paragraph headings (ALL CAPS phrase of 3+ words followed by different content)
    mid_heading_re = re.compile(r"(?<=\. )([A-Z][A-Z\s]{10,}[A-Z])(?=\.?\s+[A-Z][a-z])")
    mid_heading_paras = []
    for p in paragraphs:
        t = p["text"]
        matches = mid_heading_re.findall(t)
        if matches:
            mid_heading_paras.append({**p, "_headings": matches})
    print_finding("Mid-paragraph headings (ALL CAPS phrase mid-sentence)", "MEDIUM",
                  len(mid_heading_paras), total, mid_heading_paras)

    # ════════════════════════════════════════════════════════════════════
    print_section("4. OUTLIER DETECTION")
    # ════════════════════════════════════════════════════════════════════

    # 4a. Extremely high word count (>400)
    long_paras = [p for p in paragraphs if p["wordCount"] > 400]
    long_paras.sort(key=lambda x: x["wordCount"], reverse=True)
    print_finding("Extremely long paragraphs (>400 words)", "HIGH",
                  len(long_paras), total, long_paras)
    if long_paras:
        wc_values = [p["wordCount"] for p in long_paras]
        print(f"\n Word count range: {min(wc_values)} - {max(wc_values)}")
        print(f" Mean: {sum(wc_values)/len(wc_values):.0f}")

    # 4b. Low information density
    # Common English stopwords
    STOPWORDS = {
        "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
        "of", "with", "by", "from", "is", "are", "was", "were", "be", "been",
        "being", "have", "has", "had", "do", "does", "did", "will", "would",
        "could", "should", "may", "might", "shall", "can", "that", "which",
        "who", "whom", "this", "these", "those", "it", "its", "we", "our",
        "us", "they", "their", "them", "he", "she", "his", "her", "as",
        "if", "not", "no", "nor", "so", "than", "too", "very", "such",
        "also", "each", "any", "all", "both", "other", "some", "into",
        "through", "during", "before", "after", "about", "between", "under",
        "over", "above", "up", "down", "out", "off", "then", "once",
    }
    low_info_paras = []
    for p in paragraphs:
        words = re.findall(r"[a-z]+", p["text"].lower())
        if len(words) < 20:
            continue  # ratio unstable on very short texts
        stop_ratio = sum(1 for w in words if w in STOPWORDS) / len(words)
        if stop_ratio > 0.65:
            low_info_paras.append({**p, "_stop_ratio": stop_ratio})
    low_info_paras.sort(key=lambda x: x["_stop_ratio"], reverse=True)
    print_finding("Low information density (>65% stopwords)", "LOW",
                  len(low_info_paras), total, low_info_paras)

    # 4c. Exact substring matches across filings
    print("\n--- Exact substring matches across filings [HIGH CONCERN] ---")
    print(" (Checking paragraphs that appear as substrings of others in different filings...)")
    # Group by accession number for efficiency
    # NOTE(review): by_accession is built but never read afterwards — either
    # dead code or a leftover from an earlier version; confirm and remove.
    by_accession = defaultdict(list)
    for p in paragraphs:
        acc = p["filing"]["accessionNumber"]
        by_accession[acc].append(p)

    # For efficiency, only check paragraphs 50-200 chars (likely fragments/duplicates)
    # Sort by length so shorter ones are checked as substrings of longer ones
    candidates = [(p["text"], p["filing"]["accessionNumber"], p["filing"]["companyName"], p["id"])
                  for p in paragraphs if 50 <= len(p["text"]) <= 200]
    longer_texts = [(p["text"], p["filing"]["accessionNumber"], p["filing"]["companyName"])
                    for p in paragraphs if len(p["text"]) > 200]

    substring_matches = []
    # Use a set for dedup
    seen = set()
    # Only check a sample for performance (O(candidates x longer_texts) scan)
    check_limit = min(len(candidates), 3000)
    for i in range(check_limit):
        cand_text, cand_acc, cand_co, cand_id = candidates[i]
        for long_text, long_acc, long_co in longer_texts[:5000]:
            if cand_acc == long_acc:
                continue # same filing, skip
            if cand_text in long_text and cand_id not in seen:
                seen.add(cand_id)
                substring_matches.append({
                    "text": cand_text,
                    "filing": {"companyName": cand_co, "accessionNumber": cand_acc},
                    "_found_in": long_co,
                })
                break
    print(f" Count (sampled {check_limit:,} short paras against {min(len(longer_texts), 5000):,} long paras): {len(substring_matches):,}")
    for i, ex in enumerate(substring_matches[:5]):
        print(f" Example {i+1} [{ex['filing']['companyName']}] (also in {ex['_found_in']}):")
        print(f" {truncate(ex['text'], 300)}")
    if len(substring_matches) > 5:
        print(f" ... and {len(substring_matches) - 5:,} more")

    # ════════════════════════════════════════════════════════════════════
    print_section("5. SEMANTIC COHERENCE")
    # ════════════════════════════════════════════════════════════════════

    # 5a. Company name mismatch — look for SPECIFIC named companies in text
    # that differ from the filing company. Filter out generic refs like "the Company".
    company_name_mismatches = []
    # Pattern: proper noun(s) + legal suffix at end, NOT preceded by "the "
    specific_company_re = re.compile(
        r"(?<!\bthe )(?<!\bThe )(?<!\ba )(?<!\bA )"
        r"\b([A-Z][A-Za-z&\.']+(?:\s+[A-Z][A-Za-z&\.']+){0,5})"
        r",?\s+(Corp(?:oration)?|Inc(?:orporated)?|LLC|Ltd|L\.P\.|Holdings|Partners)\b\.?"
    )
    # Generic phrases to ignore
    GENERIC_COMPANY_REFS = {
        "the company", "our company", "a company", "each company",
        "any company", "this company", "such company", "parent company",
        "holding company", "shell company", "blank check company",
        "portfolio company", "operating company", "management company",
        "insurance company", "affiliated company",
    }
    for p in paragraphs:
        t = p["text"]
        filing_company = p["filing"]["companyName"]
        matches = specific_company_re.findall(t)
        if not matches:
            continue
        # Compare word sets (3+ letter words) rather than exact strings, so
        # suffix/abbreviation differences don't produce false mismatches.
        filing_words = set(w.lower() for w in re.findall(r"[A-Za-z]{3,}", filing_company))
        for name_part, suffix in matches:
            full = f"{name_part} {suffix}".strip()
            if full.lower() in GENERIC_COMPANY_REFS:
                continue
            mention_words = set(w.lower() for w in re.findall(r"[A-Za-z]{3,}", full))
            generic = {"inc", "corp", "corporation", "incorporated", "company", "group",
                       "holdings", "the", "and", "llc", "ltd", "partners", "new"}
            meaningful_filing = filing_words - generic
            meaningful_mention = mention_words - generic
            # Mismatch = mention shares no meaningful word with the filer's name
            if meaningful_mention and not (meaningful_mention & meaningful_filing):
                company_name_mismatches.append({
                    **p,
                    "_mentioned": full,
                    "_filing_company": filing_company,
                })
                break
    print_finding("Company name in text doesn't match filing metadata", "HIGH",
                  len(company_name_mismatches), total, company_name_mismatches)
    if company_name_mismatches:
        print("\n Sample mismatches (mentioned vs filing):")
        for ex in company_name_mismatches[:15]:
            print(f" Mentioned: '{ex['_mentioned']}' | Filing: '{ex['_filing_company']}'")

    # 5b. No cybersecurity keywords at all
    no_cyber = []
    for i, p in enumerate(paragraphs):
        if not has_cyber_relevance(texts_lower[i]):
            no_cyber.append(p)
    print_finding("No cybersecurity keywords at all", "HIGH",
                  len(no_cyber), total, no_cyber)
    if no_cyber:
        # Show word count distribution of non-cyber paragraphs
        wc_dist = Counter()
        for p in no_cyber:
            bucket = (p["wordCount"] // 50) * 50
            wc_dist[f"{bucket}-{bucket+49}"] += 1
        print("\n Word count distribution of non-cyber paragraphs:")
        # NOTE(review): buckets sort lexicographically here ("100-149" before
        # "50-99") — confirm whether numeric ordering was intended.
        for bucket, cnt in sorted(wc_dist.items()):
            print(f" {bucket} words: {cnt:,}")

    # ════════════════════════════════════════════════════════════════════
    print_section("BONUS: ADDITIONAL NOVEL CHECKS")
    # ════════════════════════════════════════════════════════════════════

    # 6a. Paragraphs that are mostly a URL or contain URLs
    url_re = re.compile(r"https?://\S+|www\.\S+")
    url_paras = []
    for p in paragraphs:
        urls = url_re.findall(p["text"])
        if urls:
            url_ratio = sum(len(u) for u in urls) / len(p["text"])
            url_paras.append({**p, "_urls": urls, "_ratio": url_ratio})
    url_paras.sort(key=lambda x: x["_ratio"], reverse=True)
    print_finding("Paragraphs containing URLs", "MEDIUM",
                  len(url_paras), total, url_paras)

    # 6b. Paragraphs with parenthetical references that look like citations/footnotes
    footnote_re = re.compile(r"\(\d+\)|\[\d+\]|(?:footnote|fn\.?)\s*\d+", re.I)
    footnote_paras = []
    for p in paragraphs:
        if footnote_re.search(p["text"]):
            footnote_paras.append(p)
    print_finding("Paragraphs with footnote/citation references", "LOW",
                  len(footnote_paras), total, footnote_paras)

    # 6c. Paragraphs that look like table data (multiple numeric values separated by whitespace)
    table_re = re.compile(r"(?:\d[\d,.]*\s+){3,}")
    table_paras = []
    for p in paragraphs:
        if table_re.search(p["text"]):
            table_paras.append(p)
    print_finding("Paragraphs that look like table/numeric data", "HIGH",
                  len(table_paras), total, table_paras)

    # 6d. Encoding artifacts (replacement chars, zero-width spaces, BOM, etc.)
    encoding_re = re.compile(r"[\ufffd\u200b\u200c\u200d\ufeff\u00a0]")
    encoding_paras = []
    for p in paragraphs:
        matches = encoding_re.findall(p["text"])
        if matches:
            encoding_paras.append({**p, "_artifacts": Counter(f"U+{ord(c):04X} ({c!r})" for c in matches)})
    print_finding("Encoding artifacts (replacement chars, NBSP, zero-width, BOM)", "HIGH",
                  len(encoding_paras), total, encoding_paras)
    if encoding_paras:
        all_artifacts = Counter()
        for ep in encoding_paras:
            all_artifacts.update(ep["_artifacts"])
        print("\n Artifact frequency:")
        for art, cnt in all_artifacts.most_common():
            print(f" {art}: {cnt:,}")

    # 6e. Repeated sentences within a paragraph
    repeated_sent_paras = []
    for p in paragraphs:
        t = p["text"]
        # Split on sentence boundaries
        sentences = re.split(r'(?<=[.!?])\s+', t)
        if len(sentences) < 3:
            continue
        sent_counter = Counter(s.strip().lower() for s in sentences if len(s.strip()) > 20)
        dupes = {s: c for s, c in sent_counter.items() if c > 1}
        if dupes:
            repeated_sent_paras.append({**p, "_dupes": dupes})
    print_finding("Paragraphs with repeated sentences", "HIGH",
                  len(repeated_sent_paras), total, repeated_sent_paras)

    # ════════════════════════════════════════════════════════════════════
    print_section("SUMMARY")
    # ════════════════════════════════════════════════════════════════════
    print(f"\n Total paragraphs analyzed: {total:,}")
    print(f"\n HIGH concern findings:")
    print(f" - Cross-references to non-1C items: {len(cross_ref_paras):,}")
    print(f" - Non-cyber legal boilerplate: {len(boilerplate_paras):,}")
    print(f" - Extremely long paragraphs (>400 words): {len(long_paras):,}")
    print(f" - Company name mismatches: {len(company_name_mismatches):,}")
    print(f" - No cybersecurity keywords: {len(no_cyber):,}")
    print(f" - Table/numeric data: {len(table_paras):,}")
    print(f" - Encoding artifacts: {len(encoding_paras):,}")
    print(f" - Repeated sentences: {len(repeated_sent_paras):,}")
    print(f" - Exact substring matches (sampled): {len(substring_matches):,}")
    print(f"\n MEDIUM concern findings:")
    print(f" - High uppercase ratio: {len(high_upper):,}")
    print(f" - Non-ASCII characters: {len(non_ascii_paras):,}")
    print(f" - Unusual whitespace: {len(whitespace_issues):,}")
    print(f" - Dollar amounts: {len(dollar_paras):,}")
    print(f" - Bullet points mid-text: {len(bullet_paras):,}")
    print(f" - Embedded newlines: {len(newline_paras):,}")
    print(f" - Mid-paragraph headings: {len(mid_heading_paras):,}")
    print(f" - URLs in text: {len(url_paras):,}")
    print(f"\n LOW concern findings:")
    print(f" - High punctuation density: {len(high_punct):,}")
    print(f" - Date mentions: {len(date_paras):,}")
    print(f" - Low information density: {len(low_info_paras):,}")
    print(f" - Footnote references: {len(footnote_paras):,}")
||
|
||
# Script entry point: run the audit only when executed directly.
if __name__ == "__main__":
    main()
|