SEC-cyBERT/scripts/data-push.sh
2026-03-30 16:53:35 -04:00

120 lines
3.0 KiB
Bash
Executable File

#!/usr/bin/env bash
# Compress data/ → .dvc-store/, then DVC add + push.
#
# Working files in data/ stay untouched. DVC tracks compressed copies.
# Counterpart: scripts/data-pull.sh
#
# Usage:
# ./scripts/data-push.sh # compress, add, push
# ./scripts/data-push.sh --dry-run # show what would be compressed
set -euo pipefail

# Absolute path of the repository root, resolved relative to this script
# (the script lives one directory below the root).
REPO_ROOT="$(cd -- "$(dirname -- "$0")/.." && pwd)"
DATA_DIR="$REPO_ROOT/data"         # working files; read-only for this script
STORE_DIR="$REPO_ROOT/.dvc-store"  # compressed copies that DVC tracks
DRY_RUN=false

#######################################
# Parse command-line arguments.
# Unknown arguments are rejected rather than ignored, so that a mistyped
# "--dry-run" can never fall through to a real compress + push.
# Globals:   DRY_RUN (written)
# Arguments: the script's command-line arguments
# Outputs:   error + usage line on stderr for an unknown argument
# Returns:   0 on success; exits the shell with status 2 on an unknown argument
#######################################
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --dry-run)
        DRY_RUN=true
        ;;
      *)
        printf 'error: unknown argument: %s\n' "$arg" >&2
        printf 'usage: %s [--dry-run]\n' "${0##*/}" >&2
        exit 2
        ;;
    esac
  done
}
parse_args "$@"

# Worker threads for zstd: Linux (nproc), then macOS/BSD (sysctl), else 4.
THREADS=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)

# Directories to track (everything except raw HTML, bulk metadata, and empty placeholders)
TRACK_DIRS=(
  paragraphs
  annotations
  gold
  dapt-corpus
  analysis
  bench
  pilot
)
#######################################
# Compress every file under DATA_DIR/<reldir> to STORE_DIR/<same path>.zst.
# A file is skipped when its compressed copy already exists and is newer
# than the source. Source files already named *.zst are ignored.
# Globals:   DATA_DIR, STORE_DIR, DRY_RUN, THREADS (all read)
# Arguments: $1 - directory name relative to DATA_DIR
# Outputs:   "skip" and dry-run notices on stdout
# Returns:   0 on success (a missing source directory is not an error);
#            zstd's exit status if compression fails
#######################################
compress_dir() {
  local reldir="$1"
  local srcdir="$DATA_DIR/$reldir"

  if [[ ! -d "$srcdir" ]]; then
    echo " skip $reldir/ (not found)"
    return
  fi

  local srcfile relpath dstfile tmpfile srcsize rc
  # NUL-delimited so paths with spaces or newlines survive intact.
  while IFS= read -r -d '' srcfile; do
    # The prefix is quoted so glob characters in DATA_DIR match literally.
    relpath="${srcfile#"$DATA_DIR"/}"
    dstfile="$STORE_DIR/${relpath}.zst"

    # Skip if compressed version exists and is newer than source
    if [[ -f "$dstfile" && "$dstfile" -nt "$srcfile" ]]; then
      continue
    fi

    if [[ "$DRY_RUN" == true ]]; then
      # GNU stat first, BSD stat as fallback; an unreadable size prints as "B".
      srcsize=$(stat -c%s "$srcfile" 2>/dev/null || stat -f%z "$srcfile") || srcsize=""
      echo " would compress: $relpath ($(numfmt --to=iec "$srcsize" 2>/dev/null || echo "${srcsize}B"))"
    else
      mkdir -p "$(dirname "$dstfile")"
      # Compress into a temp file and rename, so an interrupted run never
      # leaves a truncated .zst that looks newer than its source.
      tmpfile="${dstfile}.tmp.$$"
      rc=0
      zstd -19 -T"$THREADS" -q --force -o "$tmpfile" -- "$srcfile" || rc=$?
      if (( rc != 0 )); then
        rm -f -- "$tmpfile"
        return "$rc"
      fi
      mv -f -- "$tmpfile" "$dstfile"
    fi
  done < <(find "$srcdir" -type f -not -name '*.zst' -print0)
}
#######################################
# Remove stale compressed files whose source no longer exists, then sweep
# away directories left empty inside the store.
# Globals:   DATA_DIR, STORE_DIR, DRY_RUN (all read)
# Arguments: none
# Outputs:   one "pruned" / "would prune" line per stale file on stdout
# Returns:   0 (also when STORE_DIR does not exist); non-zero if rm fails
#######################################
prune_stale() {
  if [[ ! -d "$STORE_DIR" ]]; then
    return
  fi

  local zstfile relpath
  while IFS= read -r -d '' zstfile; do
    # The prefix is quoted so glob characters in STORE_DIR match literally;
    # otherwise the strip fails and every file would look stale.
    relpath="${zstfile#"$STORE_DIR"/}"
    relpath="${relpath%.zst}" # strip .zst suffix to get original path

    if [[ -f "$DATA_DIR/$relpath" ]]; then
      continue
    fi

    if [[ "$DRY_RUN" == true ]]; then
      echo " would prune: $relpath.zst (source deleted)"
    else
      rm -- "$zstfile"
      echo " pruned: $relpath.zst"
    fi
  done < <(find "$STORE_DIR" -name '*.zst' -type f -print0)

  # Remove empty directories. -mindepth 1 keeps STORE_DIR itself so a later
  # "dvc add" still has a directory to track. Best-effort: errors are ignored.
  if [[ "$DRY_RUN" != true ]]; then
    find "$STORE_DIR" -mindepth 1 -type d -empty -delete 2>/dev/null || true
  fi
}
# Invoke DVC through uvx with the S3 extra; arguments are passed straight through.
run_dvc() {
  uvx --with 'dvc[s3]' dvc "$@"
}

# Phase 1: mirror each tracked directory into the store as .zst files.
printf '%s\n' "=== Compressing data/ → .dvc-store/ ==="
printf 'Threads: %s, zstd level: 19\n' "$THREADS"
printf '\n'

for tracked in "${TRACK_DIRS[@]}"; do
  printf '[%s/]\n' "$tracked"
  compress_dir "$tracked"
done

# Phase 2: drop compressed copies whose source has disappeared.
printf '\n'
printf '%s\n' "Pruning stale files..."
prune_stale

# A dry run stops here, before anything touches DVC.
if [[ "$DRY_RUN" == true ]]; then
  printf '\n'
  printf '%s\n' "(dry run — nothing written)"
  exit 0
fi

# Phase 3: register the store with DVC and upload it. DVC is run from the repo root.
printf '\n'
printf '%s\n' "=== DVC add + push ==="
cd "$REPO_ROOT"
run_dvc add .dvc-store/
printf '\n'
run_dvc push

printf '\n'
printf '%s\n' "=== Done ==="
printf '%s\n' "Commit .dvc-store.dvc and .gitignore if changed:"
printf '%s\n' " git add .dvc-store.dvc .gitignore && git commit -m 'data: update dvc-tracked data'"