#!/usr/bin/env bash
# Compress data/ → .dvc-store/, then DVC add + push.
#
# Working files in data/ stay untouched. DVC tracks compressed copies.
# Counterpart: scripts/data-pull.sh
#
# Usage:
#   ./scripts/data-push.sh            # compress, add, push
#   ./scripts/data-push.sh --dry-run  # show what would be compressed
#
# A real (non-dry) run needs `zstd` and `uvx` on PATH.

set -euo pipefail

REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
readonly REPO_ROOT
readonly DATA_DIR="$REPO_ROOT/data"
readonly STORE_DIR="$REPO_ROOT/.dvc-store"
readonly ZSTD_LEVEL=19

# CPU count: Linux (nproc), then macOS/BSD (sysctl), then a fixed fallback.
THREADS=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
readonly THREADS

# Directories to track (everything except raw HTML, bulk metadata, and
# empty placeholders).
readonly -a TRACK_DIRS=(
  paragraphs
  annotations
  gold
  dapt-corpus
  analysis
  bench
  pilot
)

DRY_RUN=false

# Path of the partially written archive currently in flight (empty when none).
PENDING_TMP=""

# Print a message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the usage summary to stderr.
usage() {
  printf 'Usage: %s [--dry-run]\n' "${0##*/}" >&2
}

# Remove a half-written temporary archive, if any, when the script exits.
cleanup() {
  if [[ -n "$PENDING_TMP" ]]; then
    rm -f -- "$PENDING_TMP"
  fi
}

#######################################
# Parse command-line arguments.
# Globals:   DRY_RUN (written)
# Arguments: the script's arguments
# Returns:   exits 2 on an unknown argument, so a mistyped --dry-run can
#            never fall through to a real compress + push.
#######################################
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --dry-run)
        DRY_RUN=true
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        printf 'unknown argument: %s\n' "$arg" >&2
        usage
        exit 2
        ;;
    esac
  done
}

#######################################
# Compress every regular file under data/<reldir> into
# .dvc-store/<same relative path>.zst. Files already named *.zst are not
# picked up by the scan.
# Globals:   DATA_DIR, STORE_DIR, THREADS, ZSTD_LEVEL, DRY_RUN (read),
#            PENDING_TMP (written)
# Arguments: $1 - directory name relative to data/
# Outputs:   progress lines on stdout
#######################################
compress_dir() {
  local reldir="$1"
  local srcdir="$DATA_DIR/$reldir"
  local srcfile relpath dstfile srcsize human

  if [[ ! -d "$srcdir" ]]; then
    echo " skip $reldir/ (not found)"
    return
  fi

  while IFS= read -r -d '' srcfile; do
    # Quote the prefix so glob characters in the path are taken literally.
    relpath="${srcfile#"$DATA_DIR"/}"
    dstfile="$STORE_DIR/${relpath}.zst"

    # Skip when the compressed copy exists and the source is not newer.
    # NOTE(review): zstd normally carries the source's mtime over to its
    # output, so an up-to-date copy has an equal (not newer) timestamp;
    # requiring the copy to be strictly newer would recompress every file on
    # every run. Confirm against the installed zstd version.
    if [[ -f "$dstfile" && ! "$srcfile" -nt "$dstfile" ]]; then
      continue
    fi

    if [[ "$DRY_RUN" == true ]]; then
      # GNU stat first, BSD stat second; '?' if the file vanished meanwhile.
      srcsize=$(stat -c%s "$srcfile" 2>/dev/null || stat -f%z "$srcfile" 2>/dev/null) \
        || srcsize='?'
      human=$(numfmt --to=iec "$srcsize" 2>/dev/null || echo "${srcsize}B")
      printf ' would compress: %s (%s)\n' "$relpath" "$human"
    else
      mkdir -p -- "${dstfile%/*}"
      # Write to a temporary name and rename into place, so an interrupted
      # run cannot leave a truncated archive that looks up to date.
      PENDING_TMP="${dstfile}.tmp.$$"
      zstd "-$ZSTD_LEVEL" -T"$THREADS" -q --force "$srcfile" -o "$PENDING_TMP"
      mv -f -- "$PENDING_TMP" "$dstfile"
      PENDING_TMP=""
    fi
  done < <(find "$srcdir" -type f -not -name '*.zst' -print0)
}

#######################################
# Remove stale compressed files whose source no longer exists, then drop
# directories left empty.
# Globals:   DATA_DIR, STORE_DIR, DRY_RUN (read)
# Outputs:   one line per (would-be) pruned file on stdout
#######################################
prune_stale() {
  local zstfile relpath

  [[ -d "$STORE_DIR" ]] || return 0

  while IFS= read -r -d '' zstfile; do
    relpath="${zstfile#"$STORE_DIR"/}"
    relpath="${relpath%.zst}" # strip .zst suffix to get original path

    if [[ -f "$DATA_DIR/$relpath" ]]; then
      continue
    fi

    if [[ "$DRY_RUN" == true ]]; then
      printf ' would prune: %s.zst (source deleted)\n' "$relpath"
    else
      rm -- "$zstfile"
      printf ' pruned: %s.zst\n' "$relpath"
    fi
  done < <(find "$STORE_DIR" -name '*.zst' -type f -print0)

  # Best-effort removal of empty directories; failures are deliberately ignored.
  if [[ "$DRY_RUN" != true ]]; then
    find "$STORE_DIR" -type d -empty -delete 2>/dev/null || true
  fi
}

# Entry point: compress tracked dirs, prune, then (unless dry-run) dvc add + push.
main() {
  local dir tool

  parse_args "$@"
  trap cleanup EXIT

  # Fail before doing any work if a required tool is missing.
  if [[ "$DRY_RUN" != true ]]; then
    for tool in zstd uvx; do
      command -v "$tool" >/dev/null 2>&1 || die "required command not found: $tool"
    done
  fi

  echo "=== Compressing data/ → .dvc-store/ ==="
  echo "Threads: $THREADS, zstd level: $ZSTD_LEVEL"
  echo ""

  for dir in "${TRACK_DIRS[@]}"; do
    echo "[$dir/]"
    compress_dir "$dir"
  done

  echo ""
  echo "Pruning stale files..."
  prune_stale

  if [[ "$DRY_RUN" == true ]]; then
    echo ""
    echo "(dry run — nothing written)"
    exit 0
  fi

  echo ""
  echo "=== DVC add + push ==="
  cd "$REPO_ROOT"
  uvx --with 'dvc[s3]' dvc add .dvc-store/
  echo ""
  uvx --with 'dvc[s3]' dvc push

  echo ""
  echo "=== Done ==="
  echo "Commit .dvc-store.dvc and .gitignore if changed:"
  echo " git add .dvc-store.dvc .gitignore && git commit -m 'data: update dvc-tracked data'"
}

main "$@"