SEC-cyBERT/scripts/package-data.sh
2026-03-30 16:53:35 -04:00

86 lines
2.7 KiB
Bash
Executable File

#!/usr/bin/env bash
# Package sec-cyBERT data into compressed archives for distribution.
#
# Produces two archives:
# sec-cybert-data.tar.zst — paragraphs, annotations, gold, bench, pilot, analysis, patches, quality
# sec-cybert-dapt-corpus.tar.zst — DAPT corpus shards (separate due to size)
#
# Usage:
# ./scripts/package-data.sh [output-dir]
#
# Default output-dir is the repo root.
# Strict mode: stop on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# The repo root is the parent of the directory that contains this script.
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
# Optional first argument selects where the archives are written.
OUTPUT_DIR="${1:-$REPO_ROOT}"
DATA_DIR="$REPO_ROOT/data"

# Verify data directory exists
if [[ ! -d "$DATA_DIR" ]]; then
  echo "Error: data/ directory not found at $DATA_DIR" >&2
  exit 1
fi

# Fail early, with a clear message, when a required tool is not installed.
for required_tool in tar zstd; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    echo "Error: required command '$required_tool' not found in PATH" >&2
    exit 1
  fi
done

# Create the output directory when it does not exist yet; otherwise zstd
# cannot open its output file.
mkdir -p -- "$OUTPUT_DIR"

# Detect thread count for parallel compression: try nproc, then
# sysctl hw.ncpu, and fall back to 4 when neither is available.
THREADS=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
echo "Using $THREADS threads for compression"

# zstd level 19 = high compression, -T for threads.
# Kept as a space-separated string: it is expanded unquoted where zstd is
# invoked so that it splits into separate options.
ZSTD_OPTS="-19 -T$THREADS"
echo ""
echo "=== Archive 1: sec-cybert-data.tar.zst ==="
echo "Includes: paragraphs, annotations, gold, bench, pilot, analysis"
echo ""

# Main archive: everything under data/ except the directories excluded below
# (raw, bulk, dapt-corpus, extracted, metadata, splits, benchmark).
#
# The compressed stream is written to a ".partial" file and renamed only after
# both tar and zstd succeed, so a failed run never leaves a truncated archive
# under the final name. Failure of either pipeline stage is detected through
# the pipefail option enabled at the top of the script.
#
# zstd gets -f because, with stdin as its input, it refuses to overwrite an
# existing output file; the rename then replaces any archive from an earlier
# run, which makes the script safe to re-run.
MAIN_ARCHIVE="$OUTPUT_DIR/sec-cybert-data.tar.zst"
MAIN_PARTIAL="$MAIN_ARCHIVE.partial"

# shellcheck disable=SC2086 # ZSTD_OPTS is an option string that must word-split
if ! tar \
  --create \
  --file - \
  --directory "$REPO_ROOT" \
  --exclude='data/raw' \
  --exclude='data/bulk' \
  --exclude='data/dapt-corpus' \
  --exclude='data/extracted' \
  --exclude='data/metadata' \
  --exclude='data/splits' \
  --exclude='data/benchmark' \
  data/ \
  | zstd $ZSTD_OPTS -f -o "$MAIN_PARTIAL"; then
  rm -f -- "$MAIN_PARTIAL"
  echo "Error: failed to create $MAIN_ARCHIVE" >&2
  exit 1
fi
mv -f -- "$MAIN_PARTIAL" "$MAIN_ARCHIVE"

# Report the archive size: try the `stat -c` form first, then the `stat -f`
# form; numfmt, when available, renders the byte count in IEC units.
MAIN_SIZE=$(stat -c%s "$MAIN_ARCHIVE" 2>/dev/null \
  || stat -f%z "$MAIN_ARCHIVE")
echo "Created: $MAIN_ARCHIVE ($(numfmt --to=iec "$MAIN_SIZE" 2>/dev/null || echo "$MAIN_SIZE bytes"))"
echo ""
echo "=== Archive 2: sec-cybert-dapt-corpus.tar.zst ==="
echo "Includes: dapt-corpus/ shards only"
echo ""

# Check if DAPT corpus exists.
# The shards are collected with a glob instead of parsing `ls` output; with
# nullglob enabled, a missing directory or a directory without *.jsonl files
# both yield an empty array.
shopt -s nullglob
DAPT_SHARDS=("$DATA_DIR/dapt-corpus/"*.jsonl)
shopt -u nullglob

if (( ${#DAPT_SHARDS[@]} == 0 )); then
  # Diagnostics go to stderr, like the error message for a missing data/.
  echo "Warning: data/dapt-corpus/ is empty or missing — skipping DAPT archive" >&2
else
  # Write to a ".partial" file and rename only after both tar and zstd
  # succeed, so a failed run never leaves a truncated archive under the final
  # name (failure of either stage is detected through pipefail). zstd gets -f
  # because, with stdin as its input, it refuses to overwrite an existing
  # output file; the rename replaces any archive from an earlier run.
  DAPT_ARCHIVE="$OUTPUT_DIR/sec-cybert-dapt-corpus.tar.zst"
  DAPT_PARTIAL="$DAPT_ARCHIVE.partial"

  # shellcheck disable=SC2086 # ZSTD_OPTS is an option string that must word-split
  if ! tar \
    --create \
    --file - \
    --directory "$REPO_ROOT" \
    data/dapt-corpus/ \
    | zstd $ZSTD_OPTS -f -o "$DAPT_PARTIAL"; then
    rm -f -- "$DAPT_PARTIAL"
    echo "Error: failed to create $DAPT_ARCHIVE" >&2
    exit 1
  fi
  mv -f -- "$DAPT_PARTIAL" "$DAPT_ARCHIVE"

  # Report the archive size: try the `stat -c` form first, then `stat -f`.
  DAPT_SIZE=$(stat -c%s "$DAPT_ARCHIVE" 2>/dev/null \
    || stat -f%z "$DAPT_ARCHIVE")
  echo "Created: $DAPT_ARCHIVE ($(numfmt --to=iec "$DAPT_SIZE" 2>/dev/null || echo "$DAPT_SIZE bytes"))"
fi
# Closing summary: print the commands that unpack the two archives.
# One printf call emits every line; each argument becomes one output line.
printf '%s\n' \
  "" \
  "=== Done ===" \
  "" \
  "To extract:" \
  " tar --zstd -xf sec-cybert-data.tar.zst" \
  " tar --zstd -xf sec-cybert-dapt-corpus.tar.zst" \
  "" \
  "Both archives extract with data/ as the root prefix."