#!/usr/bin/env bash
# Package sec-cyBERT data into compressed archives for distribution.
#
# Produces two archives:
#   sec-cybert-data.tar.zst        — paragraphs, annotations, gold, bench, pilot, analysis, patches, quality
#   sec-cybert-dapt-corpus.tar.zst — DAPT corpus shards (separate due to size)
#
# Usage:
#   ./scripts/package-data.sh [output-dir]
#
# Default output-dir is the repo root. The output directory is created if it
# does not exist, and existing archives of the same name are replaced.
#
# Requires: tar, zstd. Uses nproc/sysctl, stat and numfmt when available.

set -euo pipefail

REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
OUTPUT_DIR="${1:-$REPO_ROOT}"
DATA_DIR="$REPO_ROOT/data"

# Path of the archive currently being written; removed by cleanup() if the
# script exits before the archive has been moved to its final name.
PARTIAL_FILE=""

#######################################
# Remove a half-written archive left behind by an interrupted run.
# Globals:   PARTIAL_FILE (read)
#######################################
cleanup() {
  if [[ -n "$PARTIAL_FILE" ]]; then
    rm -f -- "$PARTIAL_FILE"
  fi
}
trap cleanup EXIT

#######################################
# Stream a tar of the given members through zstd into an archive.
# The archive is written to "<dest>.partial" and renamed only after the whole
# pipeline has succeeded, so a failure never leaves a truncated file at <dest>
# and never destroys a good archive from an earlier run.
# Globals:   REPO_ROOT, ZSTD_OPTS (read), PARTIAL_FILE (written)
# Arguments: $1 - destination archive path
#            $@ - remaining arguments are passed to tar (options and members,
#                 resolved relative to REPO_ROOT)
#######################################
create_archive() {
  local dest=$1
  shift
  local staged="$dest.partial"

  PARTIAL_FILE=$staged
  tar \
    --create \
    --file - \
    --directory "$REPO_ROOT" \
    "$@" \
    | zstd "${ZSTD_OPTS[@]}" -o "$staged"
  mv -f -- "$staged" "$dest"
  PARTIAL_FILE=""
}

#######################################
# Print a "Created: <path> (<size>)" line for a finished archive.
# Arguments: $1 - archive path
#######################################
report_archive() {
  local path=$1
  local bytes pretty

  # GNU stat uses -c%s; BSD/macOS stat uses -f%z.
  bytes=$(stat -c%s "$path" 2>/dev/null || stat -f%z "$path")
  # numfmt is GNU coreutils only; fall back to the raw byte count.
  pretty=$(numfmt --to=iec "$bytes" 2>/dev/null || echo "$bytes bytes")
  echo "Created: $path ($pretty)"
}

# Verify data directory exists
if [[ ! -d "$DATA_DIR" ]]; then
  echo "Error: data/ directory not found at $DATA_DIR" >&2
  exit 1
fi

# Fail early with a clear message rather than midway through a pipeline.
for tool in tar zstd; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "Error: required tool '$tool' not found in PATH" >&2
    exit 1
  fi
done

mkdir -p -- "$OUTPUT_DIR"

# Detect thread count for parallel compression
THREADS=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
echo "Using $THREADS threads for compression"

# zstd level 19 = high compression, -T for threads.
# -f: zstd refuses to overwrite an existing output file when its input is
# stdin (it cannot prompt), which would make every re-run fail.
ZSTD_OPTS=(-19 "-T$THREADS" -f)

echo ""
echo "=== Archive 1: sec-cybert-data.tar.zst ==="
echo "Includes: paragraphs, annotations, gold, bench, pilot, analysis"
echo ""

# Main archive: everything under data/ except the large or re-downloadable
# directories listed below.
MAIN_ARCHIVE="$OUTPUT_DIR/sec-cybert-data.tar.zst"
create_archive "$MAIN_ARCHIVE" \
  --exclude='data/raw' \
  --exclude='data/bulk' \
  --exclude='data/dapt-corpus' \
  --exclude='data/extracted' \
  --exclude='data/metadata' \
  --exclude='data/splits' \
  --exclude='data/benchmark' \
  data/
report_archive "$MAIN_ARCHIVE"

echo ""
echo "=== Archive 2: sec-cybert-dapt-corpus.tar.zst ==="
echo "Includes: dapt-corpus/ shards only"
echo ""

# Check if DAPT corpus exists. With nullglob, a missing directory and a
# directory without *.jsonl shards both yield an empty array, and no external
# `ls` is involved (so no argument-length limit and no output parsing).
shopt -s nullglob
DAPT_SHARDS=("$DATA_DIR/dapt-corpus/"*.jsonl)
shopt -u nullglob

if (( ${#DAPT_SHARDS[@]} == 0 )); then
  echo "Warning: data/dapt-corpus/ is empty or missing — skipping DAPT archive" >&2
else
  DAPT_ARCHIVE="$OUTPUT_DIR/sec-cybert-dapt-corpus.tar.zst"
  create_archive "$DAPT_ARCHIVE" data/dapt-corpus/
  report_archive "$DAPT_ARCHIVE"
fi

echo ""
echo "=== Done ==="
echo ""
echo "To extract:"
echo " tar --zstd -xf sec-cybert-data.tar.zst"
echo " tar --zstd -xf sec-cybert-dapt-corpus.tar.zst"
echo ""
echo "Both archives extract with data/ as the root prefix."