diff --git a/.claude/settings.json b/.claude/settings.json
new file mode 100644
index 0000000..e9a4e19
--- /dev/null
+++ b/.claude/settings.json
@@ -0,0 +1,5 @@
+{
+  "enabledPlugins": {
+    "typescript-lsp@claude-plugins-official": false
+  }
+}
diff --git a/.gitignore b/.gitignore
index f901b59..be6846f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 # Data (too large for git)
 data/
 models/
+checkpoints/
 
 # Dependencies
 ts/node_modules/
diff --git a/docs/TECHNICAL-GUIDE.md b/docs/TECHNICAL-GUIDE.md
index ef11b4c..6d75309 100644
--- a/docs/TECHNICAL-GUIDE.md
+++ b/docs/TECHNICAL-GUIDE.md
@@ -211,7 +211,7 @@ const result = await generateObject({
 Continue MLM pre-training on SEC filing text to create "SEC-ModernBERT-large":
 - **Training corpus:** ~450M tokens from our own 9,000 cached 10-K filings (FY2023-2024), full filing text (not just Item 1C). These are the same filings we extracted Item 1C from — we already have the raw HTML cached locally and the cleaning pipeline built (`stripHtml()` in `fast-reparse.ts`).
 - **Why not PleIAs/SEC?** The PleIAs/SEC dataset (373K filings, ~18B tokens) goes back much further in time, but: (a) one pass would take weeks on a single 3090, (b) Item 1C didn't exist before FY2023 so pre-2023 filings lack the cybersecurity disclosure vocabulary that matters most for our task, (c) the SEC filing scaling laws paper (arXiv:2512.12384) shows the largest gains come in the first 200M tokens — our 450M from recent, relevant filings is already in the sweet spot.
-- **Corpus preparation:** `dapt-corpus.ts` runs `stripHtml()` on cached filing HTML (full text, no section extraction) and outputs clean text as sharded JSONL. Same HTML cleaning that handles XBRL tags, entity decoding, page artifacts, inline element boundaries — just without the Item 1C section boundary step.
+- **Corpus preparation:** `bun run ts/scripts/dapt-corpus-prep.ts` runs `stripHtml()` (from `ts/src/extract/html-cleaner.ts`) on all cached filing HTML (full text, no section extraction) and outputs clean text as sharded JSONL to `data/dapt-corpus/`. Same HTML cleaning that handles XBRL tags, entity decoding, page artifacts, inline element boundaries — just without the Item 1C section boundary step.
 - **MLM objective:** 30% masking rate (ModernBERT convention)
 - **Learning rate:** ~5e-5 (search range: 1e-5 to 1e-4)
 - **Hardware (RTX 3090):** bf16, gradient checkpointing, seq_len=1024-2048, batch_size=2-4 + gradient accumulation to effective batch 16-32
@@ -335,6 +335,58 @@ Decoder (Unsloth LoRA):
 - lora_alpha: [16, 32, 64]
 - learning_rate: [1e-4, 2e-4, 5e-4]
 
+### 3.8 Python Training Package (`python/`)
+
+A structured Python package for all training stages. All commands run from `python/` via `uv run main.py {dapt,finetune,eval,decoder}`.
+
+**Package layout:**
+
+```
+python/
+├── main.py                  # CLI: uv run main.py {dapt,finetune,eval,decoder}
+├── pyproject.toml           # Dependencies (torch, transformers, datasets, accelerate)
+├── configs/
+│   ├── dapt/
+│   │   ├── modernbert.yaml  # ModernBERT DAPT hyperparams
+│   │   └── neobert.yaml     # NeoBERT DAPT hyperparams
+│   ├── finetune/            # Fine-tuning configs (per ablation)
+│   └── decoder/             # Qwen LoRA config
+└── src/
+    ├── common/
+    │   └── config.py        # Typed dataclass configs, YAML loading
+    ├── data/
+    │   └── corpus.py        # DAPT corpus loading, tokenization, chunking
+    ├── dapt/
+    │   └── train.py         # DAPT + TAPT (same MLM objective, different data)
+    ├── finetune/
+    │   ├── model.py         # Dual-head classifier architecture
+    │   └── train.py         # Classification fine-tuning
+    ├── eval/
+    │   └── metrics.py       # Macro-F1, MCC, Krippendorff's Alpha
+    └── decoder/
+        └── train.py         # Qwen LoRA experiment (Unsloth)
+```
+
+**DAPT/TAPT usage:**
+
+```bash
+# DAPT: full 10-K filings → SEC-ModernBERT-large
+uv run main.py dapt --config configs/dapt/modernbert.yaml
+
+# TAPT: continue from DAPT checkpoint on Item 1C paragraphs → SEC-cyBERT-large
+uv run main.py dapt --config configs/dapt/modernbert.yaml \
+  --model-path ../checkpoints/dapt/modernbert-large/final \
+  --data-path ../data/paragraphs/paragraphs-clean.jsonl \
+  --output-dir ../checkpoints/tapt/modernbert-large \
+  --stage tapt
+```
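+
+**Resuming:** An interrupted run can be resumed by setting `training.resume_from_checkpoint` in the YAML (there is no CLI flag for it). Illustrative snippet, assuming an intermediate `checkpoint-3000` directory exists under the DAPT output dir:
+
+```yaml
+training:
+  resume_from_checkpoint: ../checkpoints/dapt/modernbert-large/checkpoint-3000
+```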
+
+**Config design:** YAML files define all hyperparameters (reproducible, diffable). CLI flags override key fields (`--model-path`, `--data-path`, `--output-dir`, `--stage`) for TAPT or experimentation without duplicating config files.
+
+**Corpus preparation (prerequisite):** Run `bun run ts/scripts/dapt-corpus-prep.ts` from repo root to generate `data/dapt-corpus/` shards from cached HTML. This reuses the same `stripHtml()` from `ts/src/extract/html-cleaner.ts` that powers paragraph extraction.
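+
+**Corpus format:** Each line of a shard in `data/dapt-corpus/` is one JSON object; the training code reads the field named by `text_field` in the config (`text`). Illustrative record (the value shown is a placeholder):
+
+```json
+{"text": "...cleaned full-filing text..."}
+```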
+
+**Checkpoints:** Saved to `checkpoints/` (gitignored). Each stage writes to `{output_dir}/final/` with the model and tokenizer.
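+
+**Loading a stage's output:** Downstream stages (TAPT, classification fine-tuning) load that directory like any Hugging Face checkpoint. Illustrative snippet, using the default ModernBERT DAPT output path relative to the repo root:
+
+```python
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+
+ckpt = "checkpoints/dapt/modernbert-large/final"  # illustrative; adjust if output_dir was overridden
+model = AutoModelForMaskedLM.from_pretrained(ckpt)
+tokenizer = AutoTokenizer.from_pretrained(ckpt)
+```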
+
 ---
 
 ## 4. Evaluation & Validation
diff --git a/python/configs/dapt/modernbert.yaml b/python/configs/dapt/modernbert.yaml
new file mode 100644
index 0000000..b577e51
--- /dev/null
+++ b/python/configs/dapt/modernbert.yaml
@@ -0,0 +1,29 @@
+stage: dapt
+
+model:
+  name_or_path: answerdotai/ModernBERT-large
+  trust_remote_code: false
+
+data:
+  corpus_path: ../data/dapt-corpus
+  text_field: text
+  max_seq_length: 2048
+  validation_split: 0.02
+
+training:
+  output_dir: ../checkpoints/dapt/modernbert-large
+  learning_rate: 5.0e-5
+  mlm_probability: 0.30
+  num_train_epochs: 1
+  per_device_train_batch_size: 4
+  gradient_accumulation_steps: 8  # effective batch = 32
+  warmup_ratio: 0.05
+  weight_decay: 0.01
+  bf16: true
+  gradient_checkpointing: true
+  logging_steps: 50
+  save_steps: 1000
+  eval_steps: 1000
+  save_total_limit: 3
+  dataloader_num_workers: 4
+  seed: 42
diff --git a/python/configs/dapt/neobert.yaml b/python/configs/dapt/neobert.yaml
new file mode 100644
index 0000000..d5ed5a4
--- /dev/null
+++ b/python/configs/dapt/neobert.yaml
@@ -0,0 +1,29 @@
+stage: dapt
+
+model:
+  name_or_path: chandar-lab/NeoBERT
+  trust_remote_code: true
+
+data:
+  corpus_path: ../data/dapt-corpus
+  text_field: text
+  max_seq_length: 2048  # NeoBERT supports up to 4096
+  validation_split: 0.02
+
+training:
+  output_dir: ../checkpoints/dapt/neobert
+  learning_rate: 5.0e-5
+  mlm_probability: 0.20  # NeoBERT was pre-trained with 20% masking
+  num_train_epochs: 1
+  per_device_train_batch_size: 6  # smaller model, can fit more per batch
+  gradient_accumulation_steps: 5  # effective batch = 30
+  warmup_ratio: 0.05
+  weight_decay: 0.01
+  bf16: true
+  gradient_checkpointing: true
+  logging_steps: 50
+  save_steps: 1000
+  eval_steps: 1000
+  save_total_limit: 3
+  dataloader_num_workers: 4
+  seed: 42
diff --git a/python/main.py b/python/main.py
index 7162f9a..fb74cbf 100644
--- a/python/main.py
+++ b/python/main.py
@@ -1,5 +1,68 @@
-def main():
-    print("Hello from sec-cybert-train!")
+"""SEC-cyBERT training CLI.
+
+Usage:
+    uv run main.py dapt --config configs/dapt/modernbert.yaml
+    uv run main.py dapt --config configs/dapt/modernbert.yaml \\
+        --model-path ../checkpoints/dapt/modernbert-large/final \\
+        --data-path ../data/paragraphs/paragraphs-clean.jsonl \\
+        --output-dir ../checkpoints/tapt/modernbert-large \\
+        --stage tapt
+"""
+
+import argparse
+import sys
+
+
+def cmd_dapt(args: argparse.Namespace) -> None:
+    from src.common.config import DAPTConfig
+    from src.dapt.train import train
+
+    config = DAPTConfig.from_yaml(args.config)
+    config.apply_overrides(
+        model_path=args.model_path,
+        data_path=args.data_path,
+        output_dir=args.output_dir,
+        stage=args.stage,
+    )
+    train(config)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="SEC-cyBERT training pipeline",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    sub = parser.add_subparsers(dest="command", required=True)
+
+    # ── dapt / tapt ──
+    dapt = sub.add_parser(
+        "dapt",
+        help="Run DAPT or TAPT pre-training (masked language modeling)",
+    )
+    dapt.add_argument("--config", required=True, help="Path to YAML config file")
+    dapt.add_argument("--model-path", help="Override model name or checkpoint path")
+    dapt.add_argument("--data-path", help="Override corpus path (file or directory)")
+    dapt.add_argument("--output-dir", help="Override output directory")
+    dapt.add_argument("--stage", choices=["dapt", "tapt"], help="Override stage label")
+    dapt.set_defaults(func=cmd_dapt)
+
+    # ── finetune (placeholder) ──
+    ft = sub.add_parser("finetune", help="Fine-tune classifier (dual-head)")
+    ft.add_argument("--config", required=True, help="Path to YAML config file")
+    ft.set_defaults(func=lambda args: print("Fine-tuning not yet implemented."))
+
+    # ── eval (placeholder) ──
+    ev = sub.add_parser("eval", help="Evaluate a trained model")
+    ev.add_argument("--config", required=True, help="Path to YAML config file")
+    ev.set_defaults(func=lambda args: print("Evaluation not yet implemented."))
+
+    # ── decoder (placeholder) ──
+    dec = sub.add_parser("decoder", help="Decoder experiment (Qwen LoRA)")
+    dec.add_argument("--config", required=True, help="Path to YAML config file")
+    dec.set_defaults(func=lambda args: print("Decoder experiment not yet implemented."))
+
+    args = parser.parse_args()
+    args.func(args)
 
 
 if __name__ == "__main__":
diff --git a/python/pyproject.toml b/python/pyproject.toml
index c97cbd8..ea120cc 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -1,7 +1,20 @@
 [project]
 name = "sec-cybert-train"
 version = "0.1.0"
-description = "Add your description here"
+description = "SEC-cyBERT training pipeline: DAPT, TAPT, fine-tuning, and evaluation"
 readme = "README.md"
 requires-python = ">=3.13"
-dependencies = []
+dependencies = [
+    "torch",
+    "transformers",
+    "datasets",
+    "accelerate",
+    "pyyaml",
+]
+
+[project.optional-dependencies]
+flash = ["flash-attn"]
+decoder = ["unsloth"]
+
+[project.scripts]
+sec-cybert = "main:main"
diff --git a/python/src/__init__.py b/python/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/common/__init__.py b/python/src/common/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/common/config.py b/python/src/common/config.py
new file mode 100644
index 0000000..894039e
--- /dev/null
+++ b/python/src/common/config.py
@@ -0,0 +1,87 @@
+"""Typed configuration for all training stages, loaded from YAML."""
+
+from dataclasses import dataclass, field, fields
+from pathlib import Path
+
+import yaml
+
+
+@dataclass
+class ModelConfig:
+    """Which pretrained model to load."""
+
+    name_or_path: str
+    tokenizer: str | None = None  # defaults to name_or_path if None
+    trust_remote_code: bool = False
+
+
+@dataclass
+class DAPTDataConfig:
+    """Data paths and preprocessing for DAPT/TAPT."""
+
+    corpus_path: str  # directory of JSONL shards or single JSONL file
+    text_field: str = "text"
+    max_seq_length: int = 2048
+    validation_split: float = 0.02
+
+
+@dataclass
+class TrainingConfig:
+    """HuggingFace Trainer arguments."""
+
+    output_dir: str
+    learning_rate: float = 5e-5
+    mlm_probability: float = 0.30
+    num_train_epochs: int = 1
+    per_device_train_batch_size: int = 4
+    gradient_accumulation_steps: int = 8
+    warmup_ratio: float = 0.05
+    weight_decay: float = 0.01
+    bf16: bool = True
+    gradient_checkpointing: bool = True
+    logging_steps: int = 50
+    save_steps: int = 1000
+    eval_steps: int = 1000
+    save_total_limit: int = 3
+    dataloader_num_workers: int = 4
+    seed: int = 42
+    resume_from_checkpoint: str | None = None
+
+
+@dataclass
+class DAPTConfig:
+    """Full configuration for a DAPT or TAPT run."""
+
+    model: ModelConfig
+    data: DAPTDataConfig
+    training: TrainingConfig
+    stage: str = "dapt"  # "dapt" or "tapt" — informational label
+
+    @classmethod
+    def from_yaml(cls, path: str | Path) -> "DAPTConfig":
+        with open(path) as f:
+            raw = yaml.safe_load(f)
+        return cls(
+            model=ModelConfig(**raw["model"]),
+            data=DAPTDataConfig(**raw["data"]),
+            training=TrainingConfig(**raw["training"]),
+            stage=raw.get("stage", "dapt"),
+        )
+
+    def apply_overrides(
+        self,
+        *,
+        model_path: str | None = None,
+        data_path: str | None = None,
+        output_dir: str | None = None,
+        stage: str | None = None,
+    ) -> None:
+        """Apply CLI overrides on top of YAML config."""
+        if model_path is not None:
+            self.model.name_or_path = model_path
+        if data_path is not None:
+            self.data.corpus_path = data_path
+        if output_dir is not None:
+            self.training.output_dir = output_dir
+        if stage is not None:
+            self.stage = stage
diff --git a/python/src/dapt/__init__.py b/python/src/dapt/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/dapt/train.py b/python/src/dapt/train.py
new file mode 100644
index 0000000..61f6126
--- /dev/null
+++ b/python/src/dapt/train.py
@@ -0,0 +1,118 @@
+"""DAPT and TAPT training via HuggingFace Trainer.
+
+Both stages use the same masked language modeling objective — the only
+difference is the corpus (full filings for DAPT, Item 1C paragraphs for TAPT)
+and the starting checkpoint (base model for DAPT, DAPT checkpoint for TAPT).
+"""
+
+import math
+from pathlib import Path
+
+from transformers import (
+    AutoModelForMaskedLM,
+    AutoTokenizer,
+    DataCollatorForLanguageModeling,
+    Trainer,
+    TrainingArguments,
+)
+
+from ..common.config import DAPTConfig
+from ..data.corpus import load_corpus, tokenize_and_chunk
+
+
+def train(config: DAPTConfig) -> None:
+    """Run DAPT or TAPT training from a config."""
+    print(f"\n{'='*60}")
+    print(f"  SEC-cyBERT {config.stage.upper()} Training")
+    print(f"  Model:  {config.model.name_or_path}")
+    print(f"  Data:   {config.data.corpus_path}")
+    print(f"  Output: {config.training.output_dir}")
+    print(f"{'='*60}\n")
+
+    # Load tokenizer
+    tokenizer_name = config.model.tokenizer or config.model.name_or_path
+    tokenizer = AutoTokenizer.from_pretrained(
+        tokenizer_name,
+        trust_remote_code=config.model.trust_remote_code,
+    )
+
+    # Load model
+    model = AutoModelForMaskedLM.from_pretrained(
+        config.model.name_or_path,
+        trust_remote_code=config.model.trust_remote_code,
+    )
+    print(f"  Model parameters: {model.num_parameters() / 1e6:.0f}M")
+
+    # Load and prepare data
+    print(f"  Loading corpus from {config.data.corpus_path}...")
+    dataset = load_corpus(config.data.corpus_path, config.data.text_field)
+    print(f"  Raw documents: {len(dataset):,}")
+
+    print(f"  Tokenizing and chunking to {config.data.max_seq_length} tokens...")
+    chunked = tokenize_and_chunk(
+        dataset,
+        tokenizer,
+        text_field=config.data.text_field,
+        max_seq_length=config.data.max_seq_length,
+    )
+    print(f"  Training sequences: {len(chunked):,}")
+
+    # Train/val split
+    split = chunked.train_test_split(
+        test_size=config.data.validation_split,
+        seed=config.training.seed,
+    )
+    print(f"  Train: {len(split['train']):,} | Val: {len(split['test']):,}\n")
+
+    # Data collator — handles dynamic masking each epoch
+    collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer,
+        mlm=True,
+        mlm_probability=config.training.mlm_probability,
+    )
+
+    # Training arguments
+    output_dir = Path(config.training.output_dir)
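+    # Effective batch size = per_device_train_batch_size × gradient_accumulation_steps
+    # (4 × 8 = 32 with the default ModernBERT DAPT config).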
+    args = TrainingArguments(
+        output_dir=str(output_dir),
+        learning_rate=config.training.learning_rate,
+        num_train_epochs=config.training.num_train_epochs,
+        per_device_train_batch_size=config.training.per_device_train_batch_size,
+        gradient_accumulation_steps=config.training.gradient_accumulation_steps,
+        warmup_ratio=config.training.warmup_ratio,
+        weight_decay=config.training.weight_decay,
+        bf16=config.training.bf16,
+        gradient_checkpointing=config.training.gradient_checkpointing,
+        logging_steps=config.training.logging_steps,
+        save_steps=config.training.save_steps,
+        eval_strategy="steps",
+        eval_steps=config.training.eval_steps,
+        save_total_limit=config.training.save_total_limit,
+        dataloader_num_workers=config.training.dataloader_num_workers,
+        seed=config.training.seed,
+        report_to="none",
+        load_best_model_at_end=True,
+        metric_for_best_model="eval_loss",
+    )
+
+    trainer = Trainer(
+        model=model,
+        args=args,
+        train_dataset=split["train"],
+        eval_dataset=split["test"],
+        data_collator=collator,
+    )
+
+    # Train (with optional checkpoint resume)
+    trainer.train(resume_from_checkpoint=config.training.resume_from_checkpoint)
+
+    # Save final model + tokenizer
+    final_dir = output_dir / "final"
+    print(f"\n  Saving final model to {final_dir}...")
+    trainer.save_model(str(final_dir))
+    tokenizer.save_pretrained(str(final_dir))
+
+    # Log final eval (perplexity = exp of the mean cross-entropy loss in nats)
+    metrics = trainer.evaluate()
+    print(f"\n  Final eval loss: {metrics['eval_loss']:.4f}")
+    print(f"  Final perplexity: {math.exp(metrics['eval_loss']):.2f}")
+    print(f"\n  {config.stage.upper()} training complete.")
diff --git a/python/src/decoder/__init__.py b/python/src/decoder/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/eval/__init__.py b/python/src/eval/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/finetune/__init__.py b/python/src/finetune/__init__.py
new file mode 100644
index 0000000..e69de29