adding dvc backend so data can be cleanly pulled

This commit is contained in:
Joey Eamigh 2026-03-30 16:53:35 -04:00
parent 1dce1ccb73
commit c0273c9e2e
No known key found for this signature in database
GPG Key ID: CE8C05DFFC53C9CB
13 changed files with 460 additions and 2 deletions

6
.dvc-store.dvc Normal file

@@ -0,0 +1,6 @@
outs:
- md5: c633654a20f23d76af34689f7e27d58a.dir
size: 729964105
nfiles: 111
hash: md5
path: .dvc-store
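This pointer file is the only thing git tracks for the data store: the 111 files live in DVC's cache and remotes, addressed by the directory hash above. A minimal sketch of working with the pointer directly, assuming `uvx` is available as in the scripts below:

```bash
# Compare the working .dvc-store/ against the hash recorded in the pointer
uvx --with 'dvc[s3]' dvc status .dvc-store.dvc

# Re-materialize .dvc-store/ from the local DVC cache (e.g. after `dvc fetch`)
uvx --with 'dvc[s3]' dvc checkout .dvc-store.dvc
```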

3
.dvc/.gitignore vendored Normal file

@@ -0,0 +1,3 @@
/config.local
/tmp
/cache

9
.dvc/config Normal file

@@ -0,0 +1,9 @@
[core]
analytics = false
remote = r2
['remote "r2"']
url = s3://share/sec-cybert
endpointurl = https://0a665ba1f35a38354b3f623be13f14bd.r2.cloudflarestorage.com
region = auto
['remote "public"']
url = https://share.lightningcode.dev/sec-cybert
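The committed config intentionally carries no credentials. One plausible way to supply R2 keys for pushing is DVC's `--local` config layer, which writes to `.dvc/config.local` (excluded by the `.dvc/.gitignore` above); the variable names follow `.env.example`:

```bash
# Sketch: attach R2 credentials to the "r2" remote without touching the committed config
uvx --with 'dvc[s3]' dvc remote modify --local r2 access_key_id "$R2_ACCESS_KEY_ID"
uvx --with 'dvc[s3]' dvc remote modify --local r2 secret_access_key "$R2_SECRET_ACCESS_KEY"
```

The standard `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` environment variables also work for DVC's s3-type remotes.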

3
.dvcignore Normal file

@@ -0,0 +1,3 @@
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore
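The file ships with only the stock comment. Purely illustrative (not part of this commit): `.dvcignore` uses `.gitignore` syntax, so OS or editor junk inside tracked directories could be excluded like so:

```bash
# Hypothetical .dvcignore additions: not in this commit
cat >> .dvcignore <<'EOF'
.DS_Store
*.tmp
EOF
```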

10
.env.example Normal file

@@ -0,0 +1,10 @@
# OpenRouter (GenAI labeling pipeline)
OPENROUTER_API_KEY=""

# Cloudflare R2 (DVC data storage)
R2_BUCKET="share"
R2_ENDPOINT="https://0a665ba1f35a38354b3f623be13f14bd.r2.cloudflarestorage.com"
R2_PUBLIC_URL="https://share.lightningcode.dev"
R2_API_TOKEN=""
R2_ACCESS_KEY_ID=""
R2_SECRET_ACCESS_KEY=""
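The shell scripts below don't source `.env` themselves; one minimal way to export its values into a session (assuming simple `KEY="value"` lines with no shell metacharacters):

```bash
set -a           # auto-export every variable defined from here on
source .env
set +a
./scripts/data-push.sh --dry-run
```

For `dvc push` itself, the keys still have to reach DVC, e.g. via the `dvc remote modify --local` sketch under `.dvc/config` above.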

5
.gitignore vendored

@@ -1,7 +1,9 @@
-# Data (too large for git)
+# Data (too large for git — managed by DVC)
 data/
 models/
 checkpoints/
+.dvc-store/
+*.tar.zst

 # Dependencies
 ts/node_modules/
@@ -52,3 +54,4 @@ report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
 # Finder (MacOS) folder config
 .DS_Store
 python/*.whl
+/.dvc-store


@@ -55,6 +55,14 @@ All commands run from repo root via `bun run <script>`. No need to cd into subpa
 |--------|-------------|
 | `py:train` | CLI entrypoint (`uv run main.py` — pass subcommand as arg, e.g. `bun run py:train dapt --config ...`) |

+### Data management (`data:*`)
+
+| Script | What it does |
+|--------|-------------|
+| `data:push` | Compress `data/` → `.dvc-store/`, DVC add + push to R2 |
+| `data:pull` | DVC pull from R2 + decompress into `data/` |
+| `data:package` | Build standalone `.tar.zst` archives for submission |
+
 ### Cross-package

 | Script | What it does |
149
README.md Normal file

@@ -0,0 +1,149 @@
# sec-cyBERT
Classifier for SEC cybersecurity disclosure quality. Extracts Item 1C / Item 1.05 paragraphs from 10-K and 8-K filings, labels them along two dimensions (content category and specificity), and fine-tunes a ModernBERT-large model via domain-adaptive pre-training (DAPT), task-adaptive pre-training (TAPT), and supervised dual-head classification.
Three-stage labeling pipeline: synthetic expert panel (3 LLMs via OpenRouter) → judge resolution → human annotation with adjudication.
## Quick start
```bash
# Clone and install
git clone <repo-url> sec-cyBERT && cd sec-cyBERT
bun install
# Pull data (no credentials needed, ~700 MB compressed download)
bun run data:pull
```
That gives you all extracted paragraphs, annotations, the DAPT corpus, benchmark results, and pilot experiments. See [`data/README.md`](data/README.md) for the full manifest.
### Prerequisites
| Tool | Install |
|------|---------|
| [Bun](https://bun.sh) ≥1.1 | `curl -fsSL https://bun.sh/install \| bash` |
| [zstd](https://github.com/facebook/zstd) ≥1.5 | `apt install zstd` / `brew install zstd` |
Additional prerequisites depending on what you're running:
| Tool | Needed for | Install |
|------|-----------|---------|
| [uv](https://docs.astral.sh/uv/) ≥0.5 | Training pipeline | `curl -LsSf https://astral.sh/uv/install.sh \| sh` |
| [Docker](https://docs.docker.com/get-docker/) ≥24 | Labelapp (Postgres) | Package manager or Docker Desktop |
| NVIDIA GPU + CUDA ≥13.0 | DAPT / TAPT / fine-tuning | — |
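An optional sanity pass over the toolchain before starting; these version flags are standard for each tool:

```bash
bun --version      # want ≥ 1.1
zstd --version     # want ≥ 1.5
uv --version       # training pipeline only
docker --version   # labelapp only
nvidia-smi         # GPU/driver visibility for DAPT / TAPT / fine-tuning
```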
## Project structure
```
sec-cyBERT/
├── packages/schemas/ # Shared Zod schemas (@sec-cybert/schemas)
├── ts/ # GenAI labeling pipeline (Vercel AI SDK, OpenRouter)
├── python/ # Training pipeline (HuggingFace Trainer, PyTorch)
│ └── configs/ # YAML training configs
├── labelapp/ # Next.js human labeling webapp
├── data/ # All data artifacts (DVC-managed, see data/README.md)
├── checkpoints/ # Model training checkpoints
├── scripts/ # Data packaging and utility scripts
└── docs/ # Project documentation
```
## Pipeline
```
SEC EDGAR (14,759 filings)
        │
        ▼
[1] Extract paragraphs ──→ data/paragraphs/ (72,045 paragraphs)
        │
        ▼
[2] Quality audit + patch ──→ data/paragraphs/quality/, patches/
        │
        ├──→ [3] Stage 1: 3-model annotation ──→ data/annotations/stage1.patched.jsonl
        │              │
        │              ▼
        │     [4] Stage 2: judge resolution ──→ data/annotations/stage2/
        │              │
        │              ▼
        │     [5] Human labeling ──→ data/gold/gold-labels.jsonl
        │
        ├──→ [6] DAPT corpus prep ──→ data/dapt-corpus/ (1.06B tokens)
        │              │
        │              ▼
        │     [7] DAPT ──→ checkpoints/dapt/
        │              │
        │              ▼
        │     [8] TAPT ──→ checkpoints/tapt/
        │
        └──→ [9] Fine-tune dual-head classifier ──→ final model
```
## Scripts
All commands run from repo root via `bun run <script>`.
### Data extraction and labeling (`ts:*`)
```bash
bun run ts:sec extract:10k # Extract 10-K Item 1C paragraphs from EDGAR
bun run ts:sec extract:8k # Extract 8-K Item 1.05 disclosures
bun run ts:sec extract:merge # Merge + deduplicate
bun run ts:sec label:annotate-all # Stage 1: 3-model panel annotation (~$116)
bun run ts:sec label:consensus # Compute consensus from panel
bun run ts:sec label:judge # Stage 2: judge resolution
```
### Training (`py:*`)
```bash
cd python && uv sync --extra flash # Install Python deps + flash-attn (pre-built wheel, CUDA ≥13.0)
cd ..
bun run py:train dapt --config configs/dapt/modernbert.yaml # DAPT (~13.5h on RTX 3090)
bun run py:train tapt --config configs/tapt/modernbert.yaml # TAPT (~2h)
bun run py:train finetune --config configs/ft/modernbert.yaml # Fine-tune classifier
```
### Data management (`data:*`)
```bash
bun run data:pull # Download from R2 + decompress (no auth needed)
bun run data:push # Compress + upload to R2 via DVC (needs R2 write keys)
bun run data:package # Build standalone .tar.zst archives for offline distribution
```
## Data
Data is versioned with [DVC](https://dvc.org/) and stored compressed (zstd-19) on Cloudflare R2. `bun run data:pull` fetches everything with no credentials required.
| Dataset | Records | Description |
|---------|---------|-------------|
| Paragraphs | 72,045 | Extracted SEC filing paragraphs with filing metadata |
| Stage 1 annotations | 150,009 | 3-model panel labels (category + specificity) |
| DAPT corpus | 14,756 docs | Full 10-K text for masked language model pre-training |
| Gold labels | *(in progress)* | Human-adjudicated ground truth (1,200 paragraphs) |
See [`data/README.md`](data/README.md) for schemas, row counts, and reproduction steps for every file.
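After a pull, a couple of quick spot checks against the counts above (the `stage1.patched.jsonl` path comes from the pipeline diagram; the rest of the layout is in `data/README.md`):

```bash
# JSONL is one record per line, so this should be near 150,009
wc -l data/annotations/stage1.patched.jsonl

# Browse what was decompressed
find data -maxdepth 2 -type d | sort
```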
## Labelapp
The human labeling webapp lives in `labelapp/`. It requires Postgres (via Docker) and has its own setup:
```bash
docker compose up -d # Start Postgres
bun run la:db:migrate # Apply migrations
bun run la:seed # Seed paragraphs
bun run la:assign # Generate annotator assignments (BIBD)
bun run la:dev # Start dev server
bun run la:export # Export adjudicated gold labels
```
See [`labelapp/AGENTS.md`](labelapp/AGENTS.md) for labelapp-specific development notes.
## Environment variables
Copy `.env.example` to `.env` and fill in the values you need:
| Variable | Needed for |
|----------|-----------|
| `OPENROUTER_API_KEY` | GenAI labeling pipeline (extraction is free) |
| `R2_ACCESS_KEY_ID` / `R2_SECRET_ACCESS_KEY` | Pushing data to R2 via DVC (pulling is anonymous) |
| `DATABASE_URL` | Labelapp only (defaults to local Postgres) |

package.json

@@ -20,7 +20,10 @@
     "ts:sec": "bun run --filter sec-cybert sec",
     "ts:typecheck": "bun run --filter sec-cybert typecheck",
     "py:train": "cd python && uv run main.py",
-    "typecheck": "bun run --filter '*' typecheck"
+    "typecheck": "bun run --filter '*' typecheck",
+    "data:push": "./scripts/data-push.sh",
+    "data:pull": "./scripts/data-pull.sh",
+    "data:package": "./scripts/package-data.sh"
   },
   "workspaces": [
     "packages/*",

python/pyproject.toml

@@ -22,3 +22,6 @@ sec-cybert = "main:main"
 [[tool.uv.index]]
 url = "https://pypi.org/simple/"
 default = true
+
+[tool.uv.sources]
+flash-attn = { url = "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.9.4/flash_attn-2.6.3%2Bcu130torch2.11-cp313-cp313-linux_x86_64.whl" }
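The pinned wheel targets CUDA 13.0, torch 2.11, and CPython 3.13 (per its filename), so it only helps on a matching stack. A hedged post-install check that the prebuilt binary actually imports:

```bash
cd python
uv sync --extra flash
uv run python -c "import torch, flash_attn; print(torch.__version__, flash_attn.__version__)"
```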

57
scripts/data-pull.sh Executable file

@@ -0,0 +1,57 @@
#!/usr/bin/env bash
# DVC pull → decompress .dvc-store/ back into data/.
#
# Counterpart: scripts/data-push.sh
#
# Usage:
# ./scripts/data-pull.sh # pull + decompress all
# ./scripts/data-pull.sh --local # decompress only (skip dvc pull, use existing cache)
set -euo pipefail
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
DATA_DIR="$REPO_ROOT/data"
STORE_DIR="$REPO_ROOT/.dvc-store"
SKIP_PULL=false
[[ "${1:-}" == "--local" ]] && SKIP_PULL=true
THREADS=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
if ! $SKIP_PULL; then
echo "=== DVC pull ==="
cd "$REPO_ROOT"
# Pull from public HTTP remote (no credentials needed)
uvx --with 'dvc[s3]' dvc pull -r public
echo ""
fi
if [[ ! -d "$STORE_DIR" ]]; then
echo "Error: .dvc-store/ not found — run dvc pull first or check .dvc-store.dvc exists" >&2
exit 1
fi
echo "=== Decompressing .dvc-store/ → data/ ==="
echo "Threads: $THREADS"
echo ""
count=0
while IFS= read -r -d '' zstfile; do
relpath="${zstfile#$STORE_DIR/}"
relpath="${relpath%.zst}" # strip .zst to get original relative path
dstfile="$DATA_DIR/$relpath"
dstdir="$(dirname "$dstfile")"
# Skip if destination exists and is newer than compressed source
if [[ -f "$dstfile" && "$dstfile" -nt "$zstfile" ]]; then
continue
fi
mkdir -p "$dstdir"
zstd -d -T"$THREADS" -q --force "$zstfile" -o "$dstfile"
count=$((count + 1))
done < <(find "$STORE_DIR" -name '*.zst' -type f -print0)
echo "Decompressed $count files into data/"
echo ""
echo "=== Done ==="

119
scripts/data-push.sh Executable file

@@ -0,0 +1,119 @@
#!/usr/bin/env bash
# Compress data/ → .dvc-store/, then DVC add + push.
#
# Working files in data/ stay untouched. DVC tracks compressed copies.
# Counterpart: scripts/data-pull.sh
#
# Usage:
# ./scripts/data-push.sh # compress, add, push
# ./scripts/data-push.sh --dry-run # show what would be compressed
set -euo pipefail
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
DATA_DIR="$REPO_ROOT/data"
STORE_DIR="$REPO_ROOT/.dvc-store"
DRY_RUN=false
[[ "${1:-}" == "--dry-run" ]] && DRY_RUN=true
THREADS=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
# Directories to track (everything except raw HTML, bulk metadata, and empty placeholders)
TRACK_DIRS=(
paragraphs
annotations
gold
dapt-corpus
analysis
bench
pilot
)
compress_dir() {
local reldir="$1"
local srcdir="$DATA_DIR/$reldir"
local dstdir="$STORE_DIR/$reldir"
if [[ ! -d "$srcdir" ]]; then
echo " skip $reldir/ (not found)"
return
fi
# Find all files (not dirs) in the source
while IFS= read -r -d '' srcfile; do
local relpath="${srcfile#$DATA_DIR/}"
local dstfile="$STORE_DIR/${relpath}.zst"
local dstdir_for_file="$(dirname "$dstfile")"
# Skip if compressed version exists and is newer than source
if [[ -f "$dstfile" && "$dstfile" -nt "$srcfile" ]]; then
continue
fi
if $DRY_RUN; then
local srcsize=$(stat -c%s "$srcfile" 2>/dev/null || stat -f%z "$srcfile")
echo " would compress: $relpath ($(numfmt --to=iec "$srcsize" 2>/dev/null || echo "${srcsize}B"))"
else
mkdir -p "$dstdir_for_file"
zstd -19 -T"$THREADS" -q --force "$srcfile" -o "$dstfile"
fi
done < <(find "$srcdir" -type f -not -name '*.zst' -print0)
}
# Remove stale compressed files whose source no longer exists
prune_stale() {
if [[ ! -d "$STORE_DIR" ]]; then return; fi
while IFS= read -r -d '' zstfile; do
local relpath="${zstfile#$STORE_DIR/}"
relpath="${relpath%.zst}" # strip .zst suffix to get original path
local srcfile="$DATA_DIR/$relpath"
if [[ ! -f "$srcfile" ]]; then
if $DRY_RUN; then
echo " would prune: $relpath.zst (source deleted)"
else
rm "$zstfile"
echo " pruned: $relpath.zst"
fi
fi
done < <(find "$STORE_DIR" -name '*.zst' -type f -print0)
# Remove empty directories
if ! $DRY_RUN; then
find "$STORE_DIR" -type d -empty -delete 2>/dev/null || true
fi
}
echo "=== Compressing data/ → .dvc-store/ ==="
echo "Threads: $THREADS, zstd level: 19"
echo ""
for dir in "${TRACK_DIRS[@]}"; do
echo "[$dir/]"
compress_dir "$dir"
done
echo ""
echo "Pruning stale files..."
prune_stale
if $DRY_RUN; then
echo ""
echo "(dry run — nothing written)"
exit 0
fi
echo ""
echo "=== DVC add + push ==="
cd "$REPO_ROOT"
uvx --with 'dvc[s3]' dvc add .dvc-store/
echo ""
uvx --with 'dvc[s3]' dvc push
echo ""
echo "=== Done ==="
echo "Commit .dvc-store.dvc and .gitignore if changed:"
echo " git add .dvc-store.dvc .gitignore && git commit -m 'data: update dvc-tracked data'"

85
scripts/package-data.sh Executable file

@@ -0,0 +1,85 @@
#!/usr/bin/env bash
# Package sec-cyBERT data into compressed archives for distribution.
#
# Produces two archives:
# sec-cybert-data.tar.zst — paragraphs, annotations, gold, bench, pilot, analysis, patches, quality
# sec-cybert-dapt-corpus.tar.zst — DAPT corpus shards (separate due to size)
#
# Usage:
# ./scripts/package-data.sh [output-dir]
#
# Default output-dir is the repo root.
set -euo pipefail
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
OUTPUT_DIR="${1:-$REPO_ROOT}"
DATA_DIR="$REPO_ROOT/data"
# Verify data directory exists
if [[ ! -d "$DATA_DIR" ]]; then
echo "Error: data/ directory not found at $DATA_DIR" >&2
exit 1
fi
# Detect thread count for parallel compression
THREADS=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
echo "Using $THREADS threads for compression"
# zstd level 19 = high compression, -T for threads
ZSTD_OPTS="-19 -T$THREADS"
echo ""
echo "=== Archive 1: sec-cybert-data.tar.zst ==="
echo "Includes: paragraphs, annotations, gold, bench, pilot, analysis"
echo ""
# Build the main archive: use tar --exclude to skip the large or
# regenerable directories (raw/, bulk/, dapt-corpus/, extracted/,
# metadata/, splits/, benchmark/)
tar \
--create \
--file - \
--directory "$REPO_ROOT" \
--exclude='data/raw' \
--exclude='data/bulk' \
--exclude='data/dapt-corpus' \
--exclude='data/extracted' \
--exclude='data/metadata' \
--exclude='data/splits' \
--exclude='data/benchmark' \
data/ \
| zstd $ZSTD_OPTS -o "$OUTPUT_DIR/sec-cybert-data.tar.zst"
MAIN_SIZE=$(stat -c%s "$OUTPUT_DIR/sec-cybert-data.tar.zst" 2>/dev/null \
|| stat -f%z "$OUTPUT_DIR/sec-cybert-data.tar.zst")
echo "Created: $OUTPUT_DIR/sec-cybert-data.tar.zst ($(numfmt --to=iec "$MAIN_SIZE" 2>/dev/null || echo "$MAIN_SIZE bytes"))"
echo ""
echo "=== Archive 2: sec-cybert-dapt-corpus.tar.zst ==="
echo "Includes: dapt-corpus/ shards only"
echo ""
# Check if DAPT corpus exists
if [[ ! -d "$DATA_DIR/dapt-corpus" ]] || [[ -z "$(ls "$DATA_DIR/dapt-corpus/"*.jsonl 2>/dev/null)" ]]; then
echo "Warning: data/dapt-corpus/ is empty or missing — skipping DAPT archive"
else
tar \
--create \
--file - \
--directory "$REPO_ROOT" \
data/dapt-corpus/ \
| zstd $ZSTD_OPTS -o "$OUTPUT_DIR/sec-cybert-dapt-corpus.tar.zst"
DAPT_SIZE=$(stat -c%s "$OUTPUT_DIR/sec-cybert-dapt-corpus.tar.zst" 2>/dev/null \
|| stat -f%z "$OUTPUT_DIR/sec-cybert-dapt-corpus.tar.zst")
echo "Created: $OUTPUT_DIR/sec-cybert-dapt-corpus.tar.zst ($(numfmt --to=iec "$DAPT_SIZE" 2>/dev/null || echo "$DAPT_SIZE bytes"))"
fi
echo ""
echo "=== Done ==="
echo ""
echo "To extract:"
echo " tar --zstd -xf sec-cybert-data.tar.zst"
echo " tar --zstd -xf sec-cybert-dapt-corpus.tar.zst"
echo ""
echo "Both archives extract with data/ as the root prefix."