#!/usr/bin/env bash
# Pull LFS objects and decompress .lfs-store/ → data/ and checkpoints/.
# Uses xxh3 manifest for skip logic — only decompresses files that changed.
#
# Counterpart: scripts/data-push.sh
#
# Usage:
#   ./scripts/data-pull.sh                      # data only (default)
#   ./scripts/data-pull.sh --data-only          # data only (explicit)
#   ./scripts/data-pull.sh --checkpoints-only   # checkpoints only
#   ./scripts/data-pull.sh --all                # data + checkpoints
#   ./scripts/data-pull.sh --local              # decompress only (skip git pull)
#
# External tools used: git, git-lfs, xxh3sum, awk, zstd, xargs.

set -euo pipefail

REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
STORE_DIR="$REPO_ROOT/.lfs-store"
MANIFEST="$REPO_ROOT/.lfs-manifest"
# Worker count for parallel decompression: nproc, then sysctl, then 4.
THREADS=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)

# --- Parse flags -----------------------------------------------------------------

MODE="data-only"
SKIP_PULL=false

for arg in "$@"; do
  case "$arg" in
    --data-only) MODE="data-only" ;;
    --checkpoints-only) MODE="checkpoints-only" ;;
    --all) MODE="all" ;;
    --local) SKIP_PULL=true ;;
    *)
      echo "Unknown flag: $arg" >&2
      exit 1
      ;;
  esac
done

# --- Helpers ---------------------------------------------------------------------

# Print the repo-relative path prefixes selected by $MODE, one per line.
scope_prefixes() {
  case "$MODE" in
    data-only) echo "data/" ;;
    checkpoints-only) echo "checkpoints/" ;;
    all)
      echo "data/"
      echo "checkpoints/"
      ;;
  esac
}

# Prefixes are resolved once here so in_scope does not fork a subshell for
# every manifest line.
SCOPE_PREFIXES=()
while IFS= read -r scope_prefix; do
  SCOPE_PREFIXES+=("$scope_prefix")
done < <(scope_prefixes)

# Return 0 if the repo-relative path $1 starts with one of the selected
# prefixes, 1 otherwise.
in_scope() {
  local path="$1"
  local prefix
  for prefix in "${SCOPE_PREFIXES[@]}"; do
    if [[ "$path" == "$prefix"* ]]; then
      return 0
    fi
  done
  return 1
}

# Print the single --include argument for `git lfs pull`.
# git-lfs documents --include as one comma-separated list of paths, so all
# prefixes are joined into one flag instead of repeating the flag per prefix.
lfs_include_patterns() {
  local joined=""
  local prefix
  for prefix in "${SCOPE_PREFIXES[@]}"; do
    joined+="${joined:+,}.lfs-store/${prefix}**"
  done
  printf '%s\n' "--include=${joined}"
}

# --- Main ------------------------------------------------------------------------

echo "=== LFS pull (mode: $MODE) ==="
echo "Threads: $THREADS"
echo ""

# 1. Git pull (fetch manifest + LFS pointers)
if [[ "$SKIP_PULL" != true ]]; then
  echo "Pulling from remote..."
  cd "$REPO_ROOT"
  git pull
  echo ""
  echo "Fetching LFS objects..."
  # Quoted so the ** in the pattern reaches git-lfs instead of being globbed
  # or word-split by the shell.
  git lfs pull "$(lfs_include_patterns)"
  echo ""
fi

# 2. Verify manifest exists
if [[ ! -f "$MANIFEST" ]]; then
  echo "Error: .lfs-manifest not found — run git pull first or check that data has been pushed" >&2
  exit 1
fi

if [[ ! -d "$STORE_DIR" ]]; then
  echo "Error: .lfs-store/ not found — run git lfs pull first" >&2
  exit 1
fi

# 3. Read manifest, find files needing decompression
echo "Checking which files need decompression..."

TO_DECOMPRESS=()

# Manifest rows are tab-separated: relpath, hash, then further columns that
# this script does not use. The `|| [[ -n ... ]]` keeps a final row that has
# no trailing newline.
while IFS=$'\t' read -r relpath hash _ || [[ -n "$relpath" ]]; do
  # Skip blank lines and comment lines.
  if [[ -z "$relpath" || "$relpath" == "#"* ]]; then
    continue
  fi
  in_scope "$relpath" || continue

  zstfile="$STORE_DIR/${relpath}.zst"
  if [[ ! -f "$zstfile" ]]; then
    echo " warning: missing compressed file: ${relpath}.zst" >&2
    continue
  fi

  dst="$REPO_ROOT/$relpath"

  # Skip if destination exists and hash matches
  if [[ -f "$dst" ]]; then
    current_hash=$(xxh3sum "$dst" | awk '{print $1}')
    # Drop the "XXH3_" tag some xxh3sum builds prepend to the digest.
    current_hash="${current_hash#XXH3_}"
    if [[ "$current_hash" == "$hash" ]]; then
      continue
    fi
  fi

  TO_DECOMPRESS+=("$relpath")
done < "$MANIFEST"

echo " ${#TO_DECOMPRESS[@]} files to decompress"

if [[ ${#TO_DECOMPRESS[@]} -eq 0 ]]; then
  echo ""
  echo "Everything up to date."
  exit 0
fi

# 4. Decompress in parallel
echo ""
echo "Decompressing..."

# Paths travel NUL-delimited and reach the worker as positional arguments, so
# quotes, spaces, `$` or backticks in a path are never parsed as shell code.
# The worker runs under `set -e` so a failed mkdir/zstd gives a non-zero exit
# (and no "decompressed" line) instead of being masked by the trailing echo.
if ! printf '%s\0' "${TO_DECOMPRESS[@]}" |
  xargs -0 -P "$THREADS" -I{} bash -c '
    set -euo pipefail
    store_dir=$1
    repo_root=$2
    relpath=$3
    src="$store_dir/$relpath.zst"
    dst="$repo_root/$relpath"
    mkdir -p "$(dirname "$dst")"
    zstd -d -q --force "$src" -o "$dst"
    echo " decompressed: $relpath"
  ' _ "$STORE_DIR" "$REPO_ROOT" {}; then
  echo "Error: one or more files failed to decompress" >&2
  exit 1
fi

echo ""
echo "=== Done ==="
echo "Decompressed ${#TO_DECOMPRESS[@]} files"