---
# TAPT (task-adaptive pretraining) config: continues MLM training of the
# DAPT ModernBERT-large checkpoint on the task paragraph corpus.
stage: tapt

model:
  # Start from the final DAPT checkpoint, not the base model.
  name_or_path: ../checkpoints/dapt/modernbert-large/final
  trust_remote_code: false

data:
  corpus_path: ../data/paragraphs/paragraphs-clean.patched.jsonl
  text_field: text
  max_seq_length: 512  # 99.6% of paragraphs fit; mean=127, P99=386
  validation_split: 0.05  # larger val split — small dataset

training:
  output_dir: ../checkpoints/tapt/modernbert-large
  learning_rate: 5.0e-5
  mlm_probability: 0.30
  whole_word_mask: true
  num_train_epochs: 5
  per_device_train_batch_size: 32  # 22.7 GB peak w/ torch.compile at seq_len=512
  gradient_accumulation_steps: 1  # effective batch = 32 (matches DAPT)
  warmup_ratio: 0.05
  weight_decay: 1.0e-5
  bf16: true
  gradient_checkpointing: false  # short sequences, not needed
  logging_steps: 50
  save_strategy: epoch
  eval_strategy: epoch
  save_total_limit: 6  # keep all 5 epoch checkpoints + final
  dataloader_num_workers: 4
  seed: 42