From 99cf4a606c526a3d75d562b2eb2364fd05a9a721 Mon Sep 17 00:00:00 2001 From: Joey Eamigh <55670930+JoeyEamigh@users.noreply.github.com> Date: Sun, 29 Mar 2026 21:03:11 -0400 Subject: [PATCH] thread tokenization and chunking --- docs/STATUS.md | 111 +++++++++++++++++++++++++++++++++++++++ python/src/dapt/train.py | 62 +++++++++++++--------- 2 files changed, 148 insertions(+), 25 deletions(-) create mode 100644 docs/STATUS.md diff --git a/docs/STATUS.md b/docs/STATUS.md new file mode 100644 index 0000000..4a870d6 --- /dev/null +++ b/docs/STATUS.md @@ -0,0 +1,111 @@ +# Project Status — 2026-03-29 + +## What's Done + +### Data Pipeline +- [x] 72,045 paragraphs extracted from ~9,000 10-K filings + 207 8-K filings +- [x] 14 filing generators identified, quality metrics per generator +- [x] 6 surgical patches applied (orphan words + heading stripping) +- [x] Quality tier system: clean (80.7%), headed (10.3%), degraded (6.0%), minor (3.0%) +- [x] Embedded bullet detection (2,163 paragraphs flagged degraded, 0.5x sample weight) +- [x] All data integrity rules formalized (frozen originals, UUID-linked patches) + +### GenAI Labeling (Stage 1) +- [x] Prompt v2.5 locked after 12+ iterations +- [x] 3-model panel: gemini-flash-lite + mimo-v2-flash + grok-4.1-fast +- [x] 150,009 annotations completed ($115.88, 0 failures) +- [x] Orphan word re-annotation: 1,537 paragraphs re-run ($3.30), merged into `stage1.patched.jsonl` +- [x] Codebook v3.0 with 3 major rulings + +### DAPT Corpus +- [x] 14,568 documents, ~1.056B tokens, cleaned (XBRL, URLs, page numbers stripped) +- [x] Training pipeline verified end-to-end (PyTorch 2.10, CUDA, ModernBERT loads, tokenization works) +- [x] Config: 8192 seq_len, batch=1, grad_accum=32, 1 epoch, bf16 +- [x] Procedure documented in `docs/DAPT-PROCEDURE.md` + +### Documentation +- [x] `docs/DATA-QUALITY-AUDIT.md` — full audit with all patches and quality tiers +- [x] `docs/EDGAR-FILING-GENERATORS.md` — 14 generators with signatures and quality profiles +- [x] `docs/DAPT-PROCEDURE.md` — pre-flight checklist, commands, monitoring guide +- [x] `docs/NARRATIVE.md` — 11 phases documented through DAPT corpus prep + +## What's In Progress + +### DAPT Training (~4-8h) +```bash +cd python && bun run py:train dapt --config configs/dapt/modernbert.yaml +``` +No dependencies. Run anytime. + +### Human Labeling (139/1,200) +- 3 of 6 annotators started: 68 + 50 + 21 paragraphs completed +- Deployed via labelapp with quiz gating + warmup +- Each annotator needs 600 paragraphs (BIBD assignment) + +## What's Next (in dependency order) + +### 1. TAPT (~2-3h, blocked on DAPT) +Continue MLM on 72K Item 1C paragraphs using the DAPT checkpoint. +```bash +bun run py:train dapt --config configs/dapt/modernbert.yaml \ + --model-path ../checkpoints/dapt/modernbert-large/final \ + --data-path ../data/paragraphs/paragraphs-clean.patched.jsonl \ + --output-dir ../checkpoints/tapt/modernbert-large --stage tapt +``` + +### 2. Fine-tuning pipeline (no blockers — can build now) +Build the dual-head classifier (7-class category + 4-class specificity) with: +- Shared ModernBERT backbone + 2 linear classification heads +- Sample weighting from quality tiers (1.0 clean/headed/minor, 0.5 degraded) +- Confidence-stratified label assembly (unanimous → majority → judge) +- Train/val/test split with stratification +- Ablation configs: base vs +DAPT vs +DAPT+TAPT + +### 3. Judge prompt v3.0 update (no blockers — can do now) +Update `buildJudgePrompt()` with codebook v3.0 rulings: +- Materiality disclaimers → Strategy Integration +- SPACs → None/Other +- Person-vs-function test for Management↔RMP +Then re-bench against gold labels. + +### 4. Training data assembly (blocked on judge + human labels) +Combine all annotation sources into final training dataset: +- Unanimous Stage 1 labels (35,204 paragraphs, ~97% accuracy) +- Calibrated majority labels (~9-12K, ~85-90%) +- Judge high-confidence labels (~2-3K, ~84%) +- Judge low-confidence → downweight or exclude +- Quality tier sample weights applied + +### 5. Judge production run (blocked on human gold labels) +Run judge on ~409 unresolved + flagged majority cases. Validate against expanded gold set from human labels. + +### 6. Fine-tuning + ablations (blocked on steps 1-4) +7 experiments: {base, +DAPT, +DAPT+TAPT} × {with/without SCL} + best config. + +### 7. Evaluation + paper (blocked on everything above) +Full GenAI benchmark (9 models) on 1,200 holdout. Comparison tables. Write-up. + +## Parallel Tracks + +``` +Track A (GPU): DAPT ──→ TAPT ──→ Fine-tuning ──→ Eval + ↑ +Track B (API): Judge v3 → Judge run ───┤ + ↑ +Track C (Human): Labeling (139/1200) → Gold set validation + ↑ +Track D (Code): Fine-tune pipeline build ┘ +``` + +Tracks A and D can proceed now. Track B can start (prompt update) but production run waits for Track C. Everything converges at fine-tuning. + +## Key File Locations + +| What | Where | +|------|-------| +| Patched paragraphs | `data/paragraphs/training.patched.jsonl` (49,795) | +| Patched annotations | `data/annotations/stage1.patched.jsonl` (150,009) | +| Quality scores | `data/paragraphs/quality/quality-scores.jsonl` (72,045) | +| DAPT corpus | `data/dapt-corpus/shard-*.jsonl` (14,756 docs) | +| DAPT config | `python/configs/dapt/modernbert.yaml` | +| Training CLI | `python/main.py dapt --config ...` | diff --git a/python/src/dapt/train.py b/python/src/dapt/train.py index a11bcbb..c6d2dec 100644 --- a/python/src/dapt/train.py +++ b/python/src/dapt/train.py @@ -42,34 +42,46 @@ def train(config: DAPTConfig) -> None: ) print(f" Model parameters: {model.num_parameters() / 1e6:.0f}M") - # Load and prepare data - print(f" Loading corpus from {config.data.corpus_path}...") - dataset = load_corpus(config.data.corpus_path, config.data.text_field) - print(f" Raw documents: {len(dataset):,}") + # Load and prepare data (with disk cache to avoid re-tokenizing on resume) + output_dir = Path(config.training.output_dir) + cache_dir = output_dir / ".data_cache" + if cache_dir.exists(): + print(f" Loading cached dataset from {cache_dir}...") + from datasets import DatasetDict + split = DatasetDict.load_from_disk(str(cache_dir)) + print(f" Train: {len(split['train']):,} | Val: {len(split['test']):,}\n") + else: + print(f" Loading corpus from {config.data.corpus_path}...") + dataset = load_corpus(config.data.corpus_path, config.data.text_field) + print(f" Raw documents: {len(dataset):,}") - # Filter tiny documents (cover pages, empty filings) - min_chars = 10_000 - before = len(dataset) - dataset = dataset.filter(lambda x: len(x[config.data.text_field]) >= min_chars) - filtered = before - len(dataset) - if filtered > 0: - print(f" Filtered {filtered} docs < {min_chars:,} chars → {len(dataset):,} remaining") + # Filter tiny documents (cover pages, empty filings) + min_chars = 10_000 + before = len(dataset) + dataset = dataset.filter(lambda x: len(x[config.data.text_field]) >= min_chars) + filtered = before - len(dataset) + if filtered > 0: + print(f" Filtered {filtered} docs < {min_chars:,} chars → {len(dataset):,} remaining") - print(f" Tokenizing and chunking to {config.data.max_seq_length} tokens...") - chunked = tokenize_and_chunk( - dataset, - tokenizer, - text_field=config.data.text_field, - max_seq_length=config.data.max_seq_length, - ) - print(f" Training sequences: {len(chunked):,}") + print(f" Tokenizing and chunking to {config.data.max_seq_length} tokens...") + chunked = tokenize_and_chunk( + dataset, + tokenizer, + text_field=config.data.text_field, + max_seq_length=config.data.max_seq_length, + ) + print(f" Training sequences: {len(chunked):,}") - # Train/val split - split = chunked.train_test_split( - test_size=config.data.validation_split, - seed=config.training.seed, - ) - print(f" Train: {len(split['train']):,} | Val: {len(split['test']):,}\n") + # Train/val split + split = chunked.train_test_split( + test_size=config.data.validation_split, + seed=config.training.seed, + ) + print(f" Train: {len(split['train']):,} | Val: {len(split['test']):,}") + + # Cache to disk for fast resume + split.save_to_disk(str(cache_dir)) + print(f" Cached to {cache_dir}\n") # Data collator — handles dynamic masking each epoch collator = DataCollatorForLanguageModeling(