---
# Domain-adaptive pre-training (DAPT) config for NeoBERT via masked-language
# modeling. NOTE(review): presumably consumed by a HF Trainer-style launcher
# that maps the `training:` keys onto TrainingArguments — confirm against the
# training script.
stage: dapt

model:
  name_or_path: chandar-lab/NeoBERT
  # NeoBERT ships custom modeling code on the Hub, so remote code must be
  # trusted for AutoModel loading.
  trust_remote_code: true

data:
  corpus_path: ../data/dapt-corpus
  text_field: text
  max_seq_length: 2048  # NeoBERT supports up to 4096
  validation_split: 0.02

training:
  output_dir: ../checkpoints/dapt/neobert
  learning_rate: 5.0e-5
  mlm_probability: 0.20  # NeoBERT was pre-trained with 20% masking
  num_train_epochs: 1
  per_device_train_batch_size: 6  # smaller model, can fit more per batch
  gradient_accumulation_steps: 5  # effective batch = 30
  warmup_ratio: 0.05
  weight_decay: 0.01
  bf16: true
  gradient_checkpointing: true
  logging_steps: 50
  save_steps: 1000
  eval_steps: 1000
  save_total_limit: 3
  dataloader_num_workers: 4
  seed: 42