#!/usr/bin/env bash
# DVC pull → decompress .dvc-store/ back into data/.
#
# Counterpart: scripts/data-push.sh
#
# Usage:
#   ./scripts/data-pull.sh            # pull + decompress all
#   ./scripts/data-pull.sh --local    # decompress only (skip dvc pull, use existing cache)

set -euo pipefail

REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
DATA_DIR="$REPO_ROOT/data"
STORE_DIR="$REPO_ROOT/.dvc-store"

SKIP_PULL=false
[[ "${1:-}" == "--local" ]] && SKIP_PULL=true

# Parallelism for zstd: nproc (Linux) → sysctl hw.ncpu (macOS) → 4 as a fallback.
THREADS=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)

# Fail fast with a clear message instead of dying mid-loop after a long pull.
if ! command -v zstd >/dev/null 2>&1; then
  echo "Error: zstd not found on PATH — install zstd first" >&2
  exit 1
fi

if [[ "$SKIP_PULL" != true ]]; then
  echo "=== DVC pull ==="
  cd "$REPO_ROOT"
  # Pull from public HTTP remote (no credentials needed)
  uvx --with 'dvc[s3]' dvc pull -r public
  echo ""
fi

if [[ ! -d "$STORE_DIR" ]]; then
  echo "Error: .dvc-store/ not found — run dvc pull first or check .dvc-store.dvc exists" >&2
  exit 1
fi

echo "=== Decompressing .dvc-store/ → data/ ==="
echo "Threads: $THREADS"
echo ""

count=0
while IFS= read -r -d '' zstfile; do
  # Quote the expansion inside the pattern (SC2295): otherwise glob chars in
  # the repo path would be interpreted as a pattern and break the prefix strip.
  relpath="${zstfile#"$STORE_DIR"/}"
  relpath="${relpath%.zst}"           # strip .zst to get original relative path
  dstfile="$DATA_DIR/$relpath"
  dstdir="$(dirname "$dstfile")"

  # Skip if destination exists and is newer than compressed source
  if [[ -f "$dstfile" && "$dstfile" -nt "$zstfile" ]]; then
    continue
  fi

  mkdir -p "$dstdir"
  zstd -d -T"$THREADS" -q --force "$zstfile" -o "$dstfile"
  count=$((count + 1))
done < <(find "$STORE_DIR" -name '*.zst' -type f -print0)

echo "Decompressed $count files into data/"
echo ""
echo "=== Done ==="