---
# TAPT (task-adaptive pretraining) config: continues MLM training of the
# DAPT ModernBERT-large checkpoint on the task paragraph corpus.
stage: tapt

model:
  # Start from the final DAPT checkpoint, not the base model.
  name_or_path: ../checkpoints/dapt/modernbert-large/final
  trust_remote_code: false

data:
  corpus_path: ../data/paragraphs/paragraphs-clean.patched.jsonl
  text_field: text
  max_seq_length: 512  # 99.6% of paragraphs fit; mean=127, P99=386
  validation_split: 0.05  # larger val split — small dataset

training:
  output_dir: ../checkpoints/tapt/modernbert-large
  learning_rate: 5.0e-5
  mlm_probability: 0.30
  whole_word_mask: true
  num_train_epochs: 5
  per_device_train_batch_size: 32  # 22.7 GB peak w/ torch.compile at seq_len=512
  gradient_accumulation_steps: 1  # effective batch = 32 (matches DAPT)
  warmup_ratio: 0.05
  weight_decay: 1.0e-5
  bf16: true
  gradient_checkpointing: false  # short sequences, not needed
  logging_steps: 50
  save_strategy: epoch
  eval_strategy: epoch
  save_total_limit: 6  # keep all 5 epoch checkpoints + final
  dataloader_num_workers: 4
  seed: 42