SEC-cyBERT/scripts/data-pull.sh
2026-03-30 16:53:35 -04:00

58 lines
1.5 KiB
Bash
Executable File

#!/usr/bin/env bash
# DVC pull → decompress .dvc-store/ back into data/.
#
# Counterpart: scripts/data-push.sh
#
# Usage:
# ./scripts/data-pull.sh # pull + decompress all
# ./scripts/data-pull.sh --local # decompress only (skip dvc pull, use existing cache)
# Strict mode: abort on a failing command (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Absolute path of the repository root, i.e. the parent of the directory that
# holds this script. CDPATH is cleared for the cd and its stdout discarded:
# when cd resolves a relative operand through CDPATH it prints the resulting
# directory, and that extra line would be captured into REPO_ROOT along with
# the output of pwd.
REPO_ROOT="$(CDPATH='' cd -- "$(dirname -- "$0")/.." >/dev/null && pwd)"
DATA_DIR="$REPO_ROOT/data"         # destination for decompressed files
STORE_DIR="$REPO_ROOT/.dvc-store"  # compressed *.zst files

# "--local" as the first argument skips the dvc pull step. Only $1 is
# inspected; anything else on the command line is ignored.
SKIP_PULL=false
if [[ "${1:-}" == "--local" ]]; then
  SKIP_PULL=true
fi

# Thread count handed to zstd: nproc if it works, else sysctl's hw.ncpu,
# else a fixed fallback of 4.
THREADS=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
# Step 1 (skipped when --local was given): fetch the compressed store via DVC.
if [[ "$SKIP_PULL" != true ]]; then
  printf '%s\n' "=== DVC pull ==="
  # Run dvc from the repository root.
  cd "$REPO_ROOT"
  # The "public" remote is a public HTTP remote, so no credentials are needed.
  uvx --with 'dvc[s3]' dvc pull -r public
  printf '\n'
fi

# Step 2: whether freshly pulled or already on disk, the store directory must
# exist before anything can be decompressed.
[[ -d "$STORE_DIR" ]] || {
  printf '%s\n' "Error: .dvc-store/ not found — run dvc pull first or check .dvc-store.dvc exists" >&2
  exit 1
}
#######################################
# Decompress every *.zst file found under a store directory into a data
# directory, keeping the same relative layout and dropping the .zst suffix.
# Arguments:
#   $1 - store directory, searched recursively for regular files named *.zst
#   $2 - destination root for the decompressed files
#   $3 - value passed to zstd's -T option
# Outputs:
#   Writes "Decompressed N files into data/" to stdout, where N counts only
#   the files actually written (skipped files are not counted).
# Returns:
#   The failing command's status if dirname, mkdir or zstd fails; the caller
#   runs under `set -e`, so that aborts the script.
#######################################
decompress_store() {
  local store_dir=$1 data_dir=$2 threads=$3
  local zstfile relpath dstfile dstdir
  local count=0

  while IFS= read -r -d '' zstfile; do
    # The prefix is quoted so it is removed as a literal string. Unquoted, it
    # is treated as a glob pattern, and a checkout path containing [ ] * or ?
    # would fail to match, leaving the absolute store path inside relpath.
    relpath="${zstfile#"$store_dir"/}"
    relpath="${relpath%.zst}" # strip .zst to get original relative path
    dstfile="$data_dir/$relpath"

    # Skip when the destination already exists and is strictly newer than the
    # compressed source.
    # NOTE(review): equal mtimes do not count as newer. zstd appears to copy
    # the source's mtime onto its output for file-to-file runs; if so, files
    # restored here are decompressed again on every run — confirm.
    if [[ -f "$dstfile" && "$dstfile" -nt "$zstfile" ]]; then
      continue
    fi

    dstdir=$(dirname "$dstfile") || return
    mkdir -p "$dstdir" || return
    # --force lets zstd overwrite an existing (older) destination file.
    zstd -d -T"$threads" -q --force "$zstfile" -o "$dstfile" || return
    count=$((count + 1))
  done < <(find "$store_dir" -name '*.zst' -type f -print0)

  echo "Decompressed $count files into data/"
}

echo "=== Decompressing .dvc-store/ → data/ ==="
echo "Threads: $THREADS"
echo ""
decompress_store "$STORE_DIR" "$DATA_DIR" "$THREADS"
echo ""
echo "=== Done ==="