Replace DVC pipeline with Git LFS on self-hosted Gitea. New scripts use per-file xxh3 hashing for change detection and parallel zstd-19 compression. Supports separate data/checkpoint push modes.
146 lines
3.7 KiB
Bash
Executable File
146 lines
3.7 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Pull LFS objects and decompress .lfs-store/ → data/ and checkpoints/.
# Uses an xxh3 manifest for skip logic — only files whose hash changed are
# decompressed.
#
# Counterpart: scripts/data-push.sh
#
# Usage:
#   ./scripts/data-pull.sh                      # data only (default)
#   ./scripts/data-pull.sh --data-only          # data only (explicit)
#   ./scripts/data-pull.sh --checkpoints-only   # checkpoints only
#   ./scripts/data-pull.sh --all                # data + checkpoints
#   ./scripts/data-pull.sh --local              # decompress only (skip git pull)

set -euo pipefail

# Repository root: the parent of the directory that contains this script.
# `--` keeps a path starting with "-" from being parsed as an option, and cd's
# stdout is discarded because, with CDPATH set, cd may print the directory it
# resolved and that text would otherwise end up inside REPO_ROOT.
REPO_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." >/dev/null && pwd)"

# Directory holding the compressed (*.zst) copies fetched through Git LFS.
STORE_DIR="$REPO_ROOT/.lfs-store"

# Tab-separated manifest listing each tracked file and its xxh3 hash.
MANIFEST="$REPO_ROOT/.lfs-manifest"

# Parallelism for decompression: nproc, else sysctl hw.ncpu, else 4.
THREADS=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)

readonly REPO_ROOT STORE_DIR MANIFEST THREADS

# --- Parse flags -----------------------------------------------------------------

# Defaults; parse_flags overrides them from the command line.
MODE="data-only"
SKIP_PULL=false

# Print a short usage summary to stdout.
usage() {
  cat <<'EOF'
Usage: data-pull.sh [--data-only | --checkpoints-only | --all] [--local]

  --data-only          data only (default)
  --checkpoints-only   checkpoints only
  --all                data + checkpoints
  --local              decompress only (skip git pull)
  -h, --help           show this help and exit
EOF
}

# Apply command-line flags to the MODE and SKIP_PULL globals.
# Globals:   MODE (written), SKIP_PULL (written)
# Arguments: the script's command-line flags
# Exits:     0 after printing usage for -h/--help; 1 on an unknown flag.
# When several mode flags are given, the last one wins.
parse_flags() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --data-only) MODE="data-only" ;;
      --checkpoints-only) MODE="checkpoints-only" ;;
      --all) MODE="all" ;;
      --local) SKIP_PULL=true ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown flag: $arg" >&2
        exit 1
        ;;
    esac
  done
}

parse_flags "$@"

# --- Helpers ---------------------------------------------------------------------

# Print the repo-relative path prefixes selected by $MODE, one per line.
# Globals:  MODE (read)
# Outputs:  "data/" and/or "checkpoints/" on stdout
# Returns:  0 for a known mode; 1, with a message on stderr and nothing on
#           stdout, for any other value so a bad mode is not silently treated
#           as "no prefixes".
scope_prefixes() {
  case "$MODE" in
    data-only) printf '%s\n' "data/" ;;
    checkpoints-only) printf '%s\n' "checkpoints/" ;;
    all) printf '%s\n' "data/" "checkpoints/" ;;
    *)
      echo "Unknown mode: $MODE" >&2
      return 1
      ;;
  esac
}

# Decide whether a repo-relative path belongs to the current pull scope.
# Arguments: $1 - path relative to the repository root
# Returns:   0 if the path starts with one of the prefixes printed by
#            scope_prefixes, 1 otherwise.
# A path that matches a prefix textually but contains a ".." component (for
# example "data/../../x") is rejected with a warning on stderr, because it can
# resolve to a location outside the scoped directory.
in_scope() {
  local path="$1"
  local prefix

  while IFS= read -r prefix; do
    case "$path" in
      "$prefix"*)
        # Wrapping in slashes lets one pattern catch "..", leading, inner or
        # trailing, without flagging names that merely contain two dots.
        if [[ "/$path/" == *"/../"* ]]; then
          echo " warning: ignoring path with '..' component: $path" >&2
          return 1
        fi
        return 0
        ;;
    esac
  done < <(scope_prefixes)

  return 1
}

# Build the --include argument for `git lfs pull`.
# Outputs: a single word of the form
#            --include=.lfs-store/<prefix>**[,.lfs-store/<prefix>**...]
#          covering every prefix printed by scope_prefixes, or nothing at all
#          when there are no prefixes.
# git-lfs takes --include as one comma-separated list; passing the flag once
# per prefix risks only the last occurrence taking effect, so the patterns are
# joined into one argument instead.
lfs_include_patterns() {
  local joined=""
  local prefix

  while IFS= read -r prefix; do
    # ${joined:+,} inserts the separator only between entries.
    joined+="${joined:+,}.lfs-store/${prefix}**"
  done < <(scope_prefixes)

  if [[ -n "$joined" ]]; then
    printf '%s\n' "--include=${joined}"
  fi
}

# --- Main ------------------------------------------------------------------------
#
# Steps:
#   1. Unless --local was given, `git pull` and fetch the in-scope LFS objects.
#   2. Verify that the manifest and the compressed store exist.
#   3. Walk the manifest and collect in-scope files whose destination is
#      missing or whose xxh3 hash differs from the manifest entry.
#   4. Decompress the collected files in parallel with zstd.

echo "=== LFS pull (mode: $MODE) ==="
echo "Threads: $THREADS"
echo ""

# 1. Git pull (fetch manifest + LFS pointers)
if [[ "$SKIP_PULL" != true ]]; then
  echo "Pulling from remote..."
  cd "$REPO_ROOT"
  git pull
  echo ""

  echo "Fetching LFS objects..."
  # Word splitting is intentional: the helper prints the --include argument(s)
  # as whitespace-separated words.
  # shellcheck disable=SC2046
  git lfs pull $(lfs_include_patterns)
  echo ""
fi

# 2. Verify manifest exists
if [[ ! -f "$MANIFEST" ]]; then
  echo "Error: .lfs-manifest not found — run git pull first or check that data has been pushed" >&2
  exit 1
fi

if [[ ! -d "$STORE_DIR" ]]; then
  echo "Error: .lfs-store/ not found — run git lfs pull first" >&2
  exit 1
fi

# 3. Read manifest, find files needing decompression
echo "Checking which files need decompression..."
TO_DECOMPRESS=()

# Manifest lines are tab-separated: relpath, hash, then further columns this
# script does not use (collected into `_`). The `|| [[ -n ... ]]` clause keeps
# a final line that lacks a trailing newline.
while IFS=$'\t' read -r relpath hash _ || [[ -n "$relpath" ]]; do
  # Skip blank lines and "#" comment lines.
  if [[ -z "$relpath" || "$relpath" == "#"* ]]; then
    continue
  fi
  in_scope "$relpath" || continue

  zstfile="$STORE_DIR/${relpath}.zst"
  if [[ ! -f "$zstfile" ]]; then
    # Best effort: report the gap and keep going with the remaining files.
    echo " warning: missing compressed file: ${relpath}.zst" >&2
    continue
  fi

  dst="$REPO_ROOT/$relpath"

  # Skip if destination exists and hash matches
  if [[ -f "$dst" ]]; then
    current_hash=$(xxh3sum "$dst" | awk '{print $1}')
    # Drop the "XXH3_" tag if the tool prefixes the digest with it.
    current_hash="${current_hash#XXH3_}"
    if [[ "$current_hash" == "$hash" ]]; then
      continue
    fi
  fi

  TO_DECOMPRESS+=("$relpath")
done < "$MANIFEST"

echo " ${#TO_DECOMPRESS[@]} files to decompress"

if [[ ${#TO_DECOMPRESS[@]} -eq 0 ]]; then
  echo ""
  echo "Everything up to date."
  exit 0
fi

# 4. Decompress in parallel
echo ""
echo "Decompressing..."

# File names travel NUL-delimited and reach the worker as positional
# parameters ($1 store dir, $2 repo root, $3 relative path) instead of being
# spliced into the script text, so quotes, spaces or "$" in a path cannot
# change the command. `set -e` in the worker makes a failing mkdir/zstd fail
# that worker, which xargs then reports through its own exit status.
if ! printf '%s\0' "${TO_DECOMPRESS[@]}" \
  | xargs -0 -n 1 -P "$THREADS" bash -c '
    set -euo pipefail
    store_dir=$1
    repo_root=$2
    relpath=$3
    src="$store_dir/$relpath.zst"
    dst="$repo_root/$relpath"
    mkdir -p "$(dirname "$dst")"
    zstd -d -q --force "$src" -o "$dst"
    echo " decompressed: $relpath"
  ' _ "$STORE_DIR" "$REPO_ROOT"; then
  echo "Error: one or more files failed to decompress" >&2
  exit 1
fi

echo ""
echo "=== Done ==="
echo "Decompressed ${#TO_DECOMPRESS[@]} files"