SEC-cyBERT/scripts/data-pull.sh
Joey Eamigh a5f06f2db7
infra: migrate from DVC to Git LFS with xxh3 change detection
Replace DVC pipeline with Git LFS on self-hosted Gitea. New scripts
use per-file xxh3 hashing for change detection and parallel zstd-19
compression. Supports separate data/checkpoint push modes.
2026-04-05 16:21:14 -04:00

146 lines
3.7 KiB
Bash
Executable File

#!/usr/bin/env bash
# Pull LFS objects and decompress .lfs-store/ → data/ and checkpoints/.
# Uses xxh3 manifest for skip logic — only decompresses files that changed.
#
# Counterpart: scripts/data-push.sh
#
# Usage:
# ./scripts/data-pull.sh # data only (default; same as --data-only)
# ./scripts/data-pull.sh --checkpoints-only # checkpoints only
# ./scripts/data-pull.sh --all # data + checkpoints
# ./scripts/data-pull.sh --local # decompress only (skips git pull and git lfs pull); combinable with the scope flags
# Strict mode: abort on command failure, on unset variables, and on any failed
# pipeline stage.
set -o errexit -o nounset -o pipefail

# The repository root is the parent of the directory that holds this script.
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
# Compressed store and manifest both sit directly under the repository root.
STORE_DIR="${REPO_ROOT}/.lfs-store"
MANIFEST="${REPO_ROOT}/.lfs-manifest"
# Parallelism: ask nproc first, then sysctl hw.ncpu, and fall back to 4.
THREADS="$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)"
# --- Parse flags -----------------------------------------------------------------
# Defaults: operate on data/ only, and sync with the remote before decompressing.
MODE="data-only"
SKIP_PULL=false
# Walk every command-line argument; scope flags overwrite MODE (last one wins),
# --local turns off the remote sync, anything else aborts the script.
for flag; do
  case "$flag" in
    --local) SKIP_PULL=true ;;
    --all) MODE="all" ;;
    --checkpoints-only) MODE="checkpoints-only" ;;
    --data-only) MODE="data-only" ;;
    *)
      echo "Unknown flag: $flag" >&2
      exit 1
      ;;
  esac
done
# --- Helpers ---------------------------------------------------------------------
# Print the repo-relative directory prefixes selected by $MODE, one per line.
# Globals: MODE (read)
# Outputs: "data/" and/or "checkpoints/" on stdout; nothing for any other MODE.
scope_prefixes() {
  if [[ "$MODE" == "data-only" || "$MODE" == "all" ]]; then
    printf '%s\n' "data/"
  fi
  if [[ "$MODE" == "checkpoints-only" || "$MODE" == "all" ]]; then
    printf '%s\n' "checkpoints/"
  fi
}
# Test whether a repo-relative path starts with one of the active scope prefixes.
# Arguments: $1 - path to test
# Returns:   0 when some line printed by scope_prefixes is a literal prefix of $1,
#            1 otherwise.
in_scope() {
  local candidate="$1"
  local scope
  while IFS= read -r scope; do
    case "$candidate" in
      # The quoted part is matched literally; only the trailing * is a wildcard.
      "$scope"*) return 0 ;;
    esac
  done < <(scope_prefixes)
  return 1
}
# Build the --include argument for `git lfs pull`.
#
# git-lfs documents --include as ONE comma-separated list of path patterns, so
# all scope prefixes are joined into a single flag. Emitting one --include per
# prefix (as was done previously) is not documented to accumulate, which would
# leave `--all` fetching only one of the two trees.
#
# Outputs: a single word "--include=.lfs-store/<prefix>**[,...]" on stdout, or
#          nothing when scope_prefixes prints no prefixes.
lfs_include_patterns() {
  local prefix
  local joined=""
  while IFS= read -r prefix; do
    # ${joined:+,} inserts the separator only after the first pattern.
    joined+="${joined:+,}.lfs-store/${prefix}**"
  done < <(scope_prefixes)
  if [[ -z "$joined" ]]; then
    return 0
  fi
  printf '%s\n' "--include=${joined}"
}
# --- Main ------------------------------------------------------------------------
# Scratch directory for this run; the EXIT trap removes it on every exit path.
# NOTE(review): nothing in the visible script writes to TMP_DIR — confirm it is
# still needed.
TMP_DIR="$(mktemp -d)"
trap 'rm -rf "$TMP_DIR"' EXIT

# Banner: selected mode and worker count, followed by a blank line.
printf '=== LFS pull (mode: %s) ===\n' "$MODE"
printf 'Threads: %s\n\n' "$THREADS"
# 1. Sync with the remote unless --local was given: a plain `git pull` first,
#    then `git lfs pull` restricted to the include patterns for the active scope.
if ! $SKIP_PULL; then
  printf '%s\n' "Pulling from remote..."
  # Run both git commands from the repository root.
  cd "$REPO_ROOT"
  git pull
  printf '\n%s\n' "Fetching LFS objects..."
  # The helper's output is deliberately left unquoted so it word-splits into
  # separate command-line arguments.
  # shellcheck disable=SC2046
  git lfs pull $(lfs_include_patterns)
  printf '\n'
fi
# 2. Preconditions: the manifest file and the compressed store directory must
#    both exist before any decompression work is planned.
[[ -f "$MANIFEST" ]] || {
  echo "Error: .lfs-manifest not found — run git pull first or check that data has been pushed" >&2
  exit 1
}
[[ -d "$STORE_DIR" ]] || {
  echo "Error: .lfs-store/ not found — run git lfs pull first" >&2
  exit 1
}
# 3. Read the manifest and collect the files that need decompression.
#
# Each manifest line is tab-separated: <relpath> <hash> [further fields]. Only
# the first two fields are used here. A file is queued when it is in scope, its
# .zst exists in the store, and the destination is missing or hashes differently.
echo "Checking which files need decompression..."
TO_DECOMPRESS=()
# `|| [[ -n "$relpath" ]]` keeps a final line that has no trailing newline;
# without it `read` returns non-zero at EOF and that entry would be dropped.
while IFS=$'\t' read -r relpath hash _ || [[ -n "$relpath" ]]; do
  # Skip comment lines and blank lines.
  [[ "$relpath" == "#"* || -z "$relpath" ]] && continue
  in_scope "$relpath" || continue

  # The scope check only looks at the prefix, so "data/../../x" would pass it.
  # Refuse any path with a ".." component so nothing is written outside the repo.
  case "/${relpath}/" in
    */../*)
      echo "  warning: skipping unsafe manifest path: ${relpath}" >&2
      continue
      ;;
  esac

  zstfile="$STORE_DIR/${relpath}.zst"
  if [[ ! -f "$zstfile" ]]; then
    echo "  warning: missing compressed file: ${relpath}.zst" >&2
    continue
  fi

  dst="$REPO_ROOT/$relpath"
  # Skip if destination exists and hash matches
  if [[ -f "$dst" ]]; then
    current_hash=$(xxh3sum "$dst" | awk '{print $1}')
    # Drop a leading "XXH3_" tag if the tool printed one, so the value compares
    # against the bare hash stored in the manifest.
    current_hash="${current_hash#XXH3_}"
    if [[ "$current_hash" == "$hash" ]]; then
      continue
    fi
  fi
  TO_DECOMPRESS+=("$relpath")
done < "$MANIFEST"

echo "  ${#TO_DECOMPRESS[@]} files to decompress"
if [[ ${#TO_DECOMPRESS[@]} -eq 0 ]]; then
  echo ""
  echo "Everything up to date."
  exit 0
fi
# 4. Decompress in parallel.
#
# Paths are streamed NUL-delimited and handed to the worker as a positional
# argument ($3) instead of being substituted into the script text. That way a
# manifest path containing quotes, `$(...)` or backslashes is treated as data,
# never as shell code, and xargs does not apply its own quote processing to it.
# The worker runs under `set -euo pipefail`, so a failing mkdir or zstd makes it
# exit non-zero instead of falling through to the "decompressed" message.
echo ""
echo "Decompressing..."
if ! printf '%s\0' "${TO_DECOMPRESS[@]}" \
  | xargs -0 -n 1 -P "$THREADS" bash -c '
      set -euo pipefail
      store_dir=$1
      repo_root=$2
      relpath=$3
      src="$store_dir/$relpath.zst"
      dst="$repo_root/$relpath"
      mkdir -p "$(dirname "$dst")"
      zstd -d -q --force "$src" -o "$dst"
      printf "  decompressed: %s\n" "$relpath"
    ' _ "$STORE_DIR" "$REPO_ROOT"; then
  # xargs reports a non-zero status when any worker failed.
  echo "Error: one or more files failed to decompress" >&2
  exit 1
fi
echo ""
echo "=== Done ==="
echo "Decompressed ${#TO_DECOMPRESS[@]} files"