# Phase 10.8: torchao/bnb quant sweep on iter1-independent. bf16 already
# optimal; torchao int8-wo gives -19% VRAM at no F1 cost; all 4-bit
# variants collapse (ModernBERT-large too quant-sensitive).
# Phase 10.9: ONNX export + ORT eval. Legacy exporter is the only working
# path (dynamo adds 56 Memcpy nodes); ORT fp32 gives -22% latency vs torch
# via kernel fusion, but bf16+flash-attn-2 still wins; fp16 broken on rotary
# embeddings; dynamic int8 silently falls back to CPU with a 0.5 F1 collapse.
# Driver scripts are wired to `bun run py:quant` / `bun run py:onnx`; full
# reports at results/eval/{quant,onnx}/REPORT.md.
[project]
name = "sec-cybert-train"
version = "0.1.0"
description = "SEC-cyBERT training pipeline: DAPT, TAPT, fine-tuning, and evaluation"
readme = "README.md"
# Pinned to 3.13.x: the prebuilt flash-attn wheel in [tool.uv.sources]
# is cp313-only.
requires-python = ">=3.13,<3.14"
dependencies = [
    # Core training stack; torch resolves from the cu130 index declared
    # in [[tool.uv.index]] below.
    "torch>=2.11,<2.12",
    "torchao>=0.17,<0.18",
    "transformers>=5,<6",
    "datasets>=4,<5",
    "accelerate>=1,<2",
    "pyyaml>=6,<7",
    # Exact pin: the local version tag must match the prebuilt wheel URL
    # in [tool.uv.sources] (cu130 / torch 2.11 build).
    "flash-attn==2.6.3+cu130torch2.11",
    "unsloth==2026.3.11",
    # Evaluation / analysis.
    "coral-pytorch>=1.4.0",
    "scikit-learn>=1.8.0",
    "krippendorff>=0.8.2",
    "matplotlib>=3.10.8",
    "seaborn>=0.13.2",
    # ONNX export + ONNX Runtime evaluation toolchain.
    "onnx>=1.21.0",
    "onnxruntime-gpu>=1.24.4",
    # NOTE(review): onnxruntime and onnxruntime-gpu ship overlapping
    # module namespaces; installing both can shadow the GPU build.
    # Confirm the CPU package is intentionally co-installed.
    "onnxruntime>=1.24.4",
    "onnxscript>=0.6.2",
    "onnxconverter-common>=1.16.0",
]
|
[project.scripts]
# Console entry point: installs a `sec-cybert` command that calls
# main() in top-level module main.py.
sec-cybert = "main:main"
|
# CUDA 13.0 wheel index for PyTorch. `explicit = true` means packages use
# this index only when opted in via [tool.uv.sources] (torch, below).
[[tool.uv.index]]
name = "pytorch-cu130"
url = "https://download.pytorch.org/whl/cu130"
explicit = true
|
# Default index: everything not explicitly sourced elsewhere resolves
# from PyPI.
[[tool.uv.index]]
url = "https://pypi.org/simple/"
default = true
|
[tool.uv.sources]
# torch resolves exclusively from the explicit cu130 index above.
torch = [ { index = "pytorch-cu130" } ]
# Prebuilt flash-attn wheel (cp313 / cu130 / torch 2.11) — avoids a local
# nvcc build. The wheel filename's local version must stay in sync with
# the `flash-attn==2.6.3+cu130torch2.11` pin in [project.dependencies].
flash-attn = { url = "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.9.4/flash_attn-2.6.3%2Bcu130torch2.11-cp313-cp313-linux_x86_64.whl" }