86 lines
2.7 KiB
Bash
Executable File
86 lines
2.7 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# Package sec-cyBERT data into compressed archives for distribution.
#
# Produces two archives:
#   sec-cybert-data.tar.zst — paragraphs, annotations, gold, bench, pilot, analysis, patches, quality
#   sec-cybert-dapt-corpus.tar.zst — DAPT corpus shards (separate due to size)
#
# Usage:
#   ./scripts/package-data.sh [output-dir]
#
# Default output-dir is the repo root.
|
|
|
|
# Strict mode: abort on errors, on unset variables, and on a failure in any
# stage of a pipeline (the archives below are built with tar | zstd).
set -euo pipefail

# Resolve the repository root as the parent of the directory containing this
# script. `cd` output is discarded because, when CDPATH is set, `cd` prints the
# directory it entered and that text would be captured in front of `pwd`.
REPO_ROOT="$(cd -- "$(dirname -- "$0")/.." >/dev/null && pwd)"

# Optional first argument selects where archives are written; an empty or
# missing argument falls back to the repository root.
OUTPUT_DIR="${1:-$REPO_ROOT}"
DATA_DIR="$REPO_ROOT/data"

# Verify data directory exists
if [[ ! -d "$DATA_DIR" ]]; then
  echo "Error: data/ directory not found at $DATA_DIR" >&2
  exit 1
fi

# Fail before any archiving starts if a required tool is unavailable.
for required_tool in tar zstd; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    echo "Error: required command not found: $required_tool" >&2
    exit 1
  fi
done

# Create the output directory up front so a missing or invalid path is
# reported here instead of surfacing as a zstd failure mid-pipeline.
if ! mkdir -p -- "$OUTPUT_DIR"; then
  echo "Error: cannot create output directory $OUTPUT_DIR" >&2
  exit 1
fi
|
|
|
|
# Pick the worker count for zstd: try nproc first, then the sysctl CPU-count
# key, and settle on 4 when neither command succeeds.
if ! THREADS=$(nproc 2>/dev/null); then
  if ! THREADS=$(sysctl -n hw.ncpu 2>/dev/null); then
    THREADS=4
  fi
fi
printf '%s\n' "Using ${THREADS} threads for compression"

# Compression flags shared by both archives: level 19 (high compression)
# and -T<n> to compress with the detected number of threads.
ZSTD_OPTS="-19 -T${THREADS}"
|
|
|
|
echo ""
echo "=== Archive 1: sec-cybert-data.tar.zst ==="
echo "Includes: paragraphs, annotations, gold, bench, pilot, analysis"
echo ""

main_archive="$OUTPUT_DIR/sec-cybert-data.tar.zst"

# Split the shared option string into separate arguments explicitly, so the
# flags reach zstd as individual words without relying on unquoted expansion
# (which would also glob-expand the value).
read -r -a main_zstd_args <<<"$ZSTD_OPTS"

# Main archive: everything under data/ except the directories excluded below
# (raw/, bulk/, dapt-corpus/, extracted/, metadata/, splits/, benchmark/).
# tar runs relative to the repo root so members keep the data/ prefix.
#
# --force: zstd cannot prompt for overwrite confirmation while its input is
# stdin, so without it a re-run fails as soon as the archive already exists.
tar \
  --create \
  --file - \
  --directory "$REPO_ROOT" \
  --exclude='data/raw' \
  --exclude='data/bulk' \
  --exclude='data/dapt-corpus' \
  --exclude='data/extracted' \
  --exclude='data/metadata' \
  --exclude='data/splits' \
  --exclude='data/benchmark' \
  data/ \
  | zstd "${main_zstd_args[@]}" --force -o "$main_archive"

# Report the archive size: GNU stat syntax first, BSD stat syntax as fallback;
# numfmt pretty-prints when available, otherwise the raw byte count is shown.
MAIN_SIZE=$(stat -c%s "$main_archive" 2>/dev/null \
  || stat -f%z "$main_archive")
echo "Created: $main_archive ($(numfmt --to=iec "$MAIN_SIZE" 2>/dev/null || echo "$MAIN_SIZE bytes"))"
|
|
|
|
echo ""
echo "=== Archive 2: sec-cybert-dapt-corpus.tar.zst ==="
echo "Includes: dapt-corpus/ shards only"
echo ""

dapt_archive="$OUTPUT_DIR/sec-cybert-dapt-corpus.tar.zst"

# Check if DAPT corpus exists. The glob is expanded by the shell rather than
# by parsing `ls` output; when nothing matches (or the directory is missing)
# the pattern stays literal, so the first element does not name a real file.
dapt_shards=("$DATA_DIR/dapt-corpus/"*.jsonl)

if [[ ! -e "${dapt_shards[0]}" && ! -L "${dapt_shards[0]}" ]]; then
  # Diagnostics go to stderr, like the script's other error output.
  echo "Warning: data/dapt-corpus/ is empty or missing — skipping DAPT archive" >&2
else
  # Split the shared option string into separate arguments explicitly instead
  # of relying on unquoted expansion.
  read -r -a dapt_zstd_args <<<"$ZSTD_OPTS"

  # Archive the whole dapt-corpus/ directory relative to the repo root so
  # members keep the data/ prefix.
  #
  # --force: zstd cannot prompt for overwrite confirmation while its input is
  # stdin, so without it a re-run fails when the archive already exists.
  tar \
    --create \
    --file - \
    --directory "$REPO_ROOT" \
    data/dapt-corpus/ \
    | zstd "${dapt_zstd_args[@]}" --force -o "$dapt_archive"

  # Report the archive size: GNU stat syntax first, BSD stat as fallback.
  DAPT_SIZE=$(stat -c%s "$dapt_archive" 2>/dev/null \
    || stat -f%z "$dapt_archive")
  echo "Created: $dapt_archive ($(numfmt --to=iec "$DAPT_SIZE" 2>/dev/null || echo "$DAPT_SIZE bytes"))"
fi
|
|
|
|
# Closing summary: completion banner followed by the extraction commands for
# the two archive names. One printf call emits each argument on its own line.
printf '%s\n' \
  "" \
  "=== Done ===" \
  "" \
  "To extract:" \
  " tar --zstd -xf sec-cybert-data.tar.zst" \
  " tar --zstd -xf sec-cybert-dapt-corpus.tar.zst" \
  "" \
  "Both archives extract with data/ as the root prefix."
|