SEC-cyBERT/scripts/data-push.sh
2026-04-06 15:52:55 -04:00

296 lines
7.8 KiB
Bash
Executable File

#!/usr/bin/env bash
# Compress data/ and checkpoints/ → .lfs-store/, track via Git LFS.
# Uses xxh3 content hashing for per-file change detection.
#
# Counterpart: scripts/data-pull.sh
#
# Usage:
# ./scripts/data-push.sh # data only (default; same as --data-only)
# ./scripts/data-push.sh --checkpoints-only # checkpoints only
# ./scripts/data-push.sh --all # data + checkpoints
# ./scripts/data-push.sh --dry-run # show what would change
# ./scripts/data-push.sh --no-push # commit but don't push
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Absolute path of the repository root (the parent of this script's directory).
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
# The compressed store and the manifest both sit directly under the root.
STORE_DIR="${REPO_ROOT}/.lfs-store"
MANIFEST="${REPO_ROOT}/.lfs-manifest"

# Worker count: nproc if it works, else sysctl's hw.ncpu, else 4.
if ! THREADS=$(nproc 2>/dev/null); then
  if ! THREADS=$(sysctl -n hw.ncpu 2>/dev/null); then
    THREADS=4
  fi
fi
# --- Tracked data directories ---------------------------------------------------
# Repo-relative directories scanned by collect_files in every mode except
# checkpoints-only. A directory that does not exist is skipped there.
DATA_DIRS=(
  "data/paragraphs"
  "data/annotations"
  "data/gold"
  "data/dapt-corpus"
  "data/analysis"
  "data/bench"
  "data/pilot"
)
# --- Checkpoint include/exclude -------------------------------------------------
# File-name whitelist: inside checkpoints/, collect_files matches files by these
# exact names (passed to find as -name tests).
CKPT_KEEP_NAMES=(
  "model.safetensors"
  "config.json"
  "tokenizer.json"
  "tokenizer_config.json"
  "training_args.bin"
  "trainer_state.json"
)
# --- Parse flags -----------------------------------------------------------------
# Defaults: process data only, write changes, push after committing.
MODE="data-only"
DRY_RUN=false
NO_PUSH=false

# Flags are applied left to right, so a later mode flag overrides an earlier
# one. Anything unrecognised is reported on stderr and aborts with status 1.
for arg in "$@"; do
  if [[ "$arg" == "--data-only" ]]; then
    MODE="data-only"
  elif [[ "$arg" == "--checkpoints-only" ]]; then
    MODE="checkpoints-only"
  elif [[ "$arg" == "--all" ]]; then
    MODE="all"
  elif [[ "$arg" == "--dry-run" ]]; then
    DRY_RUN=true
  elif [[ "$arg" == "--no-push" ]]; then
    NO_PUSH=true
  else
    echo "Unknown flag: $arg" >&2
    exit 1
  fi
done
# --- Helpers ---------------------------------------------------------------------
# Print the source files for the current mode, one absolute path per line.
#
# Globals read: MODE, REPO_ROOT, DATA_DIRS, CKPT_KEEP_NAMES.
# Output:       each path exactly once, in first-seen order.
#
# Data files (every mode except checkpoints-only): regular files under each
# existing DATA_DIRS entry, skipping *.zst and anything below a
# .finetune_data_cache/ or .data_cache/ directory.
# Checkpoint files (every mode except data-only): files under checkpoints/
# whose name is in CKPT_KEEP_NAMES (again skipping .data_cache/), plus every
# regular file directly inside checkpoints/finetune.
collect_files() {
  local dir abs name
  local -a name_expr=()

  {
    if [[ "$MODE" != "checkpoints-only" ]]; then
      for dir in "${DATA_DIRS[@]}"; do
        abs="$REPO_ROOT/$dir"
        # A tracked directory that is absent from this checkout is skipped.
        if [[ -d "$abs" ]]; then
          find "$abs" -type f \
            -not -name '*.zst' \
            -not -path '*/.finetune_data_cache/*' \
            -not -path '*/.data_cache/*'
        fi
      done
    fi

    if [[ "$MODE" != "data-only" ]]; then
      # Build "-name A -o -name B ..." without leaving a dangling -o.
      for name in "${CKPT_KEEP_NAMES[@]}"; do
        if (( ${#name_expr[@]} > 0 )); then
          name_expr+=(-o)
        fi
        name_expr+=(-name "$name")
      done
      # find errors (for example a missing checkpoints/ tree) are deliberately
      # silenced and ignored, so a checkout without checkpoints yields no lines.
      find "$REPO_ROOT/checkpoints" -type f \
        -not -path "*/.data_cache/*" \
        \( "${name_expr[@]}" \) 2>/dev/null || true
      find "$REPO_ROOT/checkpoints/finetune" -maxdepth 1 -type f 2>/dev/null || true
    fi
  # The two checkpoint searches overlap: a whitelisted file directly inside
  # checkpoints/finetune matches both. Keep only the first occurrence so no
  # path is hashed, counted, or compressed twice.
  } | awk '!seen[$0]++'
}
# Print the repo-relative path prefix(es) covered by the current MODE, one per
# line: "data/" for data-only, "checkpoints/" for checkpoints-only, both (data
# first) for all. Any other MODE prints nothing. Used to filter manifest entries.
scope_prefixes() {
  if [[ "$MODE" == "data-only" || "$MODE" == "all" ]]; then
    printf '%s\n' "data/"
  fi
  if [[ "$MODE" == "checkpoints-only" || "$MODE" == "all" ]]; then
    printf '%s\n' "checkpoints/"
  fi
}
# Succeed (status 0) when the path given as $1 starts with one of the prefixes
# printed by scope_prefixes for the current mode; otherwise return 1.
in_scope() {
  local candidate=$1
  local known
  while IFS= read -r known; do
    case "$candidate" in
      "$known"*) return 0 ;;
    esac
  done < <(scope_prefixes)
  return 1
}
# --- Main ------------------------------------------------------------------------
# Scratch directory for the intermediate lists; removed on every exit path.
TMP_DIR=$(mktemp -d)
trap 'rm -rf "$TMP_DIR"' EXIT

printf '=== LFS push (mode: %s) ===\n' "$MODE"
printf 'Threads: %s, zstd level: 19\n\n' "$THREADS"

# 1. Collect source files
printf '%s\n' "Collecting source files..."
collect_files > "$TMP_DIR/filelist.txt"
FILE_COUNT=$(wc -l < "$TMP_DIR/filelist.txt")
printf ' found %s files\n' "$FILE_COUNT"
# With nothing collected there is nothing to hash, compare, or prune.
if (( FILE_COUNT == 0 )); then
  printf '%s\n' "No files to process."
  exit 0
fi

# 2. Hash all source files with xxh3
# The collected path list is handed to xxh3sum via --filelist; its output is
# saved for the diff step.
printf '%s\n' "Hashing with xxh3..."
xxh3sum --filelist "$TMP_DIR/filelist.txt" > "$TMP_DIR/hashes.txt"
printf '%s\n' " done"
# 3. Load old manifest into associative arrays
# OLD_HASH and OLD_SIZE map a manifest path to its recorded hash and size
# field. Each manifest line is tab-separated: path, hash, size, timestamp.
# NOTE(review): OLD_SIZE is filled here but not read in the visible code.
declare -A OLD_HASH OLD_SIZE
if [[ -f "$MANIFEST" ]]; then
  while IFS=$'\t' read -r path hash size ts; do
    # Comment/header lines and blank lines carry no entry.
    case "$path" in
      '' | '#'*) continue ;;
    esac
    OLD_HASH["$path"]="$hash"
    OLD_SIZE["$path"]="$size"
  done < "$MANIFEST"
fi
# 4. Diff: find changed, new, unchanged files
# Results, all keyed by repo-relative path:
#   NEW_HASH  - current hash of every hashed file
#   NEW       - paths with no (or an empty) entry in OLD_HASH
#   CHANGED   - paths whose hash differs from the OLD_HASH entry
#   UNCHANGED - number of paths whose hash equals the OLD_HASH entry
declare -A NEW_HASH
CHANGED=()
NEW=()
UNCHANGED=0

# classify_hashes FILE
# Read "<hash> <absolute path>" lines from FILE and sort each path into
# NEW / CHANGED / UNCHANGED by comparing against OLD_HASH, recording the
# current hash in NEW_HASH.
# Globals read: REPO_ROOT, OLD_HASH. Globals written: NEW_HASH, NEW, CHANGED,
# UNCHANGED.
classify_hashes() {
  local hash filepath relpath old
  while IFS=' ' read -r hash filepath; do
    # A line without a path cannot be keyed; skip it.
    if [[ -z "$filepath" ]]; then
      continue
    fi
    # xxh3sum prefixes hash with "XXH3_" — strip it
    hash="${hash#XXH3_}"
    # REPO_ROOT is quoted inside the pattern so it is removed as a literal
    # prefix even when the repository path contains glob characters ([, *, ?).
    relpath="${filepath#"$REPO_ROOT"/}"
    # A path listed more than once is classified only the first time, so the
    # counts and the compression list never contain duplicates.
    if [[ -n "${NEW_HASH[$relpath]+seen}" ]]; then
      continue
    fi
    NEW_HASH["$relpath"]="$hash"
    old="${OLD_HASH[$relpath]:-}"
    if [[ -z "$old" ]]; then
      NEW+=("$relpath")
    elif [[ "$old" != "$hash" ]]; then
      CHANGED+=("$relpath")
    else
      UNCHANGED=$((UNCHANGED + 1))
    fi
  done < "$1"
}

classify_hashes "$TMP_DIR/hashes.txt"
# 5. Find deletions (in old manifest + in scope, but not on disk)
DELETED=()
for path in "${!OLD_HASH[@]}"; do
  # Manifest entries outside the current mode's scope are never pruned.
  if ! in_scope "$path"; then
    continue
  fi
  # In scope but not hashed in this run: the source file is gone.
  if [[ -z "${NEW_HASH[$path]:-}" ]]; then
    DELETED+=("$path")
  fi
done

printf '\n'
printf ' new: %s\n' "${#NEW[@]}"
printf ' changed: %s\n' "${#CHANGED[@]}"
printf ' unchanged: %s\n' "$UNCHANGED"
printf ' deleted: %s\n' "${#DELETED[@]}"

# Everything that needs (re)compression: new files first, then changed ones.
TO_COMPRESS=("${NEW[@]}" "${CHANGED[@]}")
if (( ${#TO_COMPRESS[@]} == 0 && ${#DELETED[@]} == 0 )); then
  printf '\n%s\n' "Nothing to do."
  exit 0
fi
# Dry run: list what would be compressed and pruned, then stop before anything
# is written.
if $DRY_RUN; then
  printf '\n'
  if (( ${#TO_COMPRESS[@]} > 0 )); then
    printf '%s\n' "Would compress:"
    printf ' %s\n' "${TO_COMPRESS[@]}"
  fi
  if (( ${#DELETED[@]} > 0 )); then
    printf '%s\n' "Would prune:"
    printf ' %s\n' "${DELETED[@]}"
  fi
  printf '\n%s\n' "(dry run — nothing written)"
  exit 0
fi
# 6. Compress changed + new files in parallel
#
# compress_files PATH...
# Compress each repo-relative PATH from REPO_ROOT into STORE_DIR/PATH.zst with
# zstd -19, running up to THREADS workers at once. Does nothing without
# arguments.
# Globals read: REPO_ROOT, STORE_DIR, THREADS.
# Returns: xargs' status, non-zero if any worker failed.
#
# Paths are sent to xargs NUL-delimited and reach the worker shell as
# positional parameters ($1 = REPO_ROOT, $2 = STORE_DIR, $3 = path), never as
# part of the script text. Spaces, quotes, "$" and backticks in a file name or
# in the two roots are therefore taken literally instead of being parsed by
# xargs or executed by the worker.
compress_files() {
  if (( $# == 0 )); then
    return 0
  fi
  # The worker steps are chained with && so that a failed mkdir or zstd makes
  # the worker exit non-zero rather than being hidden by the final message.
  printf '%s\0' "$@" |
    xargs -0 -n 1 -P "$THREADS" bash -c '
      src="$1/$3"
      dst="$2/$3.zst"
      mkdir -p "${dst%/*}" &&
        zstd -19 -q --force "$src" -o "$dst" &&
        printf " compressed: %s\n" "$3"
    ' _ "$REPO_ROOT" "$STORE_DIR"
}

if [[ ${#TO_COMPRESS[@]} -gt 0 ]]; then
  echo ""
  echo "Compressing ${#TO_COMPRESS[@]} files (zstd -19, $THREADS parallel)..."
  compress_files "${TO_COMPRESS[@]}"
fi
# 7. Prune deleted files
if (( ${#DELETED[@]} > 0 )); then
  printf '\nPruning %s deleted files...\n' "${#DELETED[@]}"
  for relpath in "${DELETED[@]}"; do
    zstfile="$STORE_DIR/${relpath}.zst"
    # No compressed copy in the store: nothing to remove for this path.
    [[ -f "$zstfile" ]] || continue
    rm "$zstfile"
    printf ' pruned: %s\n' "${relpath}.zst"
  done
  # Remove directories left empty by the deletions; errors here are ignored.
  find "$STORE_DIR" -type d -empty -delete 2>/dev/null || true
fi
# 8. Write manifest (merge: preserve out-of-scope entries)
#
# write_manifest
# Print the new manifest on stdout: two header comment lines, then one
# tab-separated line per entry (path, xxh3, compressed_bytes, timestamp).
#   - Entries of the existing manifest that are out of scope for the current
#     mode are copied through unchanged.
#   - Every in-scope path in NEW_HASH gets a fresh line with its current hash,
#     the size of its .zst in STORE_DIR (0 if that file is missing) and this
#     run's timestamp.
# Globals read: MANIFEST, STORE_DIR, NEW_HASH; calls in_scope.
write_manifest() {
  local path hash size ts relpath zstfile stamp
  # One timestamp for the whole run: a single date call instead of one per
  # file, and every entry written by this run carries the same value.
  stamp="$(date -Iseconds)"
  {
    echo "# .lfs-manifest — path, xxh3, compressed_bytes, timestamp"
    echo "# Generated by scripts/data-push.sh — do not edit manually"
    # Preserve out-of-scope entries from old manifest
    if [[ -f "$MANIFEST" ]]; then
      while IFS=$'\t' read -r path hash size ts; do
        if [[ -z "$path" || "$path" == "#"* ]]; then
          continue
        fi
        if in_scope "$path"; then
          continue
        fi
        printf '%s\t%s\t%s\t%s\n' "$path" "$hash" "$size" "$ts"
      done < "$MANIFEST"
    fi
    # Write in-scope entries from current hashes
    for relpath in "${!NEW_HASH[@]}"; do
      in_scope "$relpath" || continue
      zstfile="$STORE_DIR/${relpath}.zst"
      if [[ -f "$zstfile" ]]; then
        # GNU stat first, BSD stat as the fallback.
        size=$(stat -c%s "$zstfile" 2>/dev/null || stat -f%z "$zstfile")
      else
        size=0
      fi
      printf '%s\t%s\t%s\t%s\n' "$relpath" "${NEW_HASH[$relpath]}" "$size" "$stamp"
    done
  # Byte-wise sort: the line order does not depend on the caller's locale, so
  # the manifest is identical across machines and the "#" header lines sort
  # ahead of the path entries.
  } | grep -v '^$' | LC_ALL=C sort
}

echo ""
echo "Writing manifest..."
# Build the file in the scratch directory first, then move it into place.
write_manifest > "$TMP_DIR/manifest.new"
mv "$TMP_DIR/manifest.new" "$MANIFEST"
# 9. Git add, commit, push
printf '\n%s\n' "=== Git commit ==="
cd "$REPO_ROOT"
git add .lfs-store/ .lfs-manifest
# Nothing staged after the add: there is nothing to commit or push.
if git diff --cached --quiet; then
  printf '%s\n' "No git changes to commit."
  exit 0
fi

# Commit subject: "data: " followed by the non-zero counts, comma-separated.
SUMMARY=""
if (( ${#NEW[@]} > 0 )); then
  SUMMARY+="${#NEW[@]} new, "
fi
if (( ${#CHANGED[@]} > 0 )); then
  SUMMARY+="${#CHANGED[@]} changed, "
fi
if (( ${#DELETED[@]} > 0 )); then
  SUMMARY+="${#DELETED[@]} deleted, "
fi
# Drop the trailing separator left by the last part.
SUMMARY="${SUMMARY%, }"
git commit -m "data: ${SUMMARY}"

if $NO_PUSH; then
  printf '%s\n' "Committed. Skipping push (--no-push)."
else
  printf '\n%s\n' "=== Git push ==="
  # Without a configured upstream, publish the current branch to origin and
  # set it as upstream; otherwise a plain push is enough.
  if ! git rev-parse --verify "@{u}" &>/dev/null; then
    git push -u origin "$(git branch --show-current)"
  else
    git push
  fi
fi

printf '\n%s\n' "=== Done ==="