#!/usr/bin/env bash
# Compress data/ and checkpoints/ → .lfs-store/, track via Git LFS.
# Uses xxh3 content hashing for per-file change detection.
#
# Counterpart: scripts/data-pull.sh
#
# Usage:
#   ./scripts/data-push.sh                     # data only (default)
#   ./scripts/data-push.sh --data-only         # same as the default
#   ./scripts/data-push.sh --checkpoints-only  # checkpoints only
#   ./scripts/data-push.sh --all               # data + checkpoints
#   ./scripts/data-push.sh --dry-run           # show what would change
#   ./scripts/data-push.sh --no-push           # commit but don't push
#
# When several mode flags are given, the last one wins.

set -euo pipefail

# Repository root is the parent of the directory holding this script.
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
STORE_DIR="$REPO_ROOT/.lfs-store"
MANIFEST="$REPO_ROOT/.lfs-manifest"
# CPU count: nproc (Linux), then sysctl (macOS/BSD), then a fixed fallback of 4.
THREADS=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)

# --- Tracked data directories ---------------------------------------------------
# Paths are relative to REPO_ROOT.
DATA_DIRS=(
  data/paragraphs
  data/annotations
  data/gold
  data/dapt-corpus
  data/analysis
  data/bench
  data/pilot
)

# --- Checkpoint include/exclude -------------------------------------------------
# Only these filenames are kept from checkpoint trees
CKPT_KEEP_NAMES=(
  model.safetensors
  config.json
  tokenizer.json
  tokenizer_config.json
  training_args.bin
  trainer_state.json
)

# --- Parse flags -----------------------------------------------------------------
MODE="data-only"
DRY_RUN=false
NO_PUSH=false

for arg in "$@"; do
  case "$arg" in
    --data-only)        MODE="data-only" ;;
    --checkpoints-only) MODE="checkpoints-only" ;;
    --all)              MODE="all" ;;
    --dry-run)          DRY_RUN=true ;;
    --no-push)          NO_PUSH=true ;;
    *)
      echo "Unknown flag: $arg" >&2
      exit 1
      ;;
  esac
done

# --- Helpers ---------------------------------------------------------------------

# Collect source files for the current mode, one per line (absolute paths).
collect_files() {
  # Print the absolute path of every source file for the current $MODE, one
  # per line, sorted and de-duplicated.
  #
  # Globals read: MODE, REPO_ROOT, DATA_DIRS, CKPT_KEEP_NAMES.
  #
  # De-duplication matters: a top-level file in checkpoints/finetune whose
  # name is also on the whitelist is matched by both find calls below, and a
  # duplicate path would be hashed, counted and compressed twice.
  #
  # The list is newline-delimited, so paths containing newlines are not
  # supported.
  local dir abs n
  local name_args=()
  {
    if [[ "$MODE" != "checkpoints-only" ]]; then
      for dir in "${DATA_DIRS[@]}"; do
        abs="$REPO_ROOT/$dir"
        # Directories that do not exist locally are skipped.
        if [[ -d "$abs" ]]; then
          find "$abs" -type f -not -name '*.zst'
        fi
      done
    fi

    if [[ "$MODE" != "data-only" ]]; then
      # Tracked checkpoint files (by name whitelist):
      # builds "-name A -o -name B ..." with no trailing -o.
      for n in "${CKPT_KEEP_NAMES[@]}"; do
        if (( ${#name_args[@]} > 0 )); then
          name_args+=(-o)
        fi
        name_args+=(-name "$n")
      done
      # Best effort: a missing checkpoints/ tree is not an error.
      find "$REPO_ROOT/checkpoints" -type f \
        -not -path "*/.data_cache/*" \
        \( "${name_args[@]}" \) 2>/dev/null || true
      # Top-level log/json files in finetune/
      find "$REPO_ROOT/checkpoints/finetune" -maxdepth 1 -type f 2>/dev/null || true
    fi
  } | LC_ALL=C sort -u
}

# Scope prefix(es) for the current mode — used to filter manifest entries.
scope_prefixes() {
  case "$MODE" in
    data-only)        echo "data/" ;;
    checkpoints-only) echo "checkpoints/" ;;
    all)              echo "data/"; echo "checkpoints/" ;;
  esac
}

# Check if a path is in scope for the current mode.
#   $1 - repo-relative path
# Returns 0 when the path starts with one of the mode's prefixes, else 1.
# Kept fork-free because it is called once per manifest entry; the prefixes
# here must stay in sync with scope_prefixes.
in_scope() {
  local path="$1"
  case "$MODE" in
    data-only)        [[ "$path" == "data/"* ]] ;;
    checkpoints-only) [[ "$path" == "checkpoints/"* ]] ;;
    all)              [[ "$path" == "data/"* || "$path" == "checkpoints/"* ]] ;;
    *)                return 1 ;;
  esac
}

# --- Main ------------------------------------------------------------------------

TMP_DIR=$(mktemp -d)
trap 'rm -rf -- "$TMP_DIR"' EXIT

echo "=== LFS push (mode: $MODE) ==="
echo "Threads: $THREADS, zstd level: 19"
echo ""

# 1. Collect source files
echo "Collecting source files..."
collect_files > "$TMP_DIR/filelist.txt"
FILE_COUNT=$(wc -l < "$TMP_DIR/filelist.txt")
# BSD wc pads its output with spaces; strip them so the message stays tidy.
FILE_COUNT="${FILE_COUNT//[[:space:]]/}"
echo " found $FILE_COUNT files"

if [[ "$FILE_COUNT" -eq 0 ]]; then
  echo "No files to process."
  exit 0
fi

# 2. Hash all source files with xxh3
echo "Hashing with xxh3..."
xxh3sum --filelist "$TMP_DIR/filelist.txt" > "$TMP_DIR/hashes.txt"
echo " done"

# 3. Load old manifest into associative arrays
#    Manifest rows are tab-separated: path, hash, size, timestamp.
#    Lines starting with "#" and rows with an empty path are ignored.
#    NOTE(review): OLD_SIZE is filled here but no visible code reads it.
declare -A OLD_HASH=() OLD_SIZE=()
if [[ -f "$MANIFEST" ]]; then
  while IFS=$'\t' read -r path hash size _ts; do
    [[ "$path" == "#"* || -z "$path" ]] && continue
    OLD_HASH["$path"]="$hash"
    OLD_SIZE["$path"]="$size"
  done < "$MANIFEST"
fi
# 4. Diff: find changed, new, unchanged files
#    Each line of hashes.txt is "<hash> <absolute path>".
declare -A NEW_HASH=()
CHANGED=()
NEW=()
UNCHANGED=0
while IFS=' ' read -r hash filepath; do
  [[ -n "$filepath" ]] || continue
  # xxh3sum prefixes hash with "XXH3_" — strip it
  hash="${hash#XXH3_}"
  # Quote the prefix so glob characters in REPO_ROOT are matched literally.
  relpath="${filepath#"$REPO_ROOT"/}"
  NEW_HASH["$relpath"]="$hash"

  old="${OLD_HASH[$relpath]:-}"
  if [[ -z "$old" ]]; then
    NEW+=("$relpath")
  elif [[ "$old" != "$hash" ]]; then
    CHANGED+=("$relpath")
  else
    UNCHANGED=$((UNCHANGED + 1))
  fi
done < "$TMP_DIR/hashes.txt"

# 5. Find deletions (in old manifest + in scope, but not on disk)
DELETED=()
for path in "${!OLD_HASH[@]}"; do
  in_scope "$path" || continue
  if [[ -z "${NEW_HASH[$path]:-}" ]]; then
    DELETED+=("$path")
  fi
done

echo ""
echo " new: ${#NEW[@]}"
echo " changed: ${#CHANGED[@]}"
echo " unchanged: $UNCHANGED"
echo " deleted: ${#DELETED[@]}"

TO_COMPRESS=("${NEW[@]}" "${CHANGED[@]}")

if [[ ${#TO_COMPRESS[@]} -eq 0 && ${#DELETED[@]} -eq 0 ]]; then
  echo ""
  echo "Nothing to do."
  exit 0
fi

if $DRY_RUN; then
  echo ""
  if [[ ${#TO_COMPRESS[@]} -gt 0 ]]; then
    echo "Would compress:"
    printf ' %s\n' "${TO_COMPRESS[@]}"
  fi
  if [[ ${#DELETED[@]} -gt 0 ]]; then
    echo "Would prune:"
    printf ' %s\n' "${DELETED[@]}"
  fi
  echo ""
  echo "(dry run — nothing written)"
  exit 0
fi

# 6. Compress changed + new files in parallel
#    Paths are sent NUL-delimited and reach the worker as positional
#    arguments ($1 = repo root, $2 = store dir, $3 = relative path), so no
#    file name is ever spliced into shell source. The worker runs under
#    "set -e" so a failed zstd makes xargs, and therefore this script, fail
#    instead of being masked by the progress message.
if [[ ${#TO_COMPRESS[@]} -gt 0 ]]; then
  echo ""
  echo "Compressing ${#TO_COMPRESS[@]} files (zstd -19, $THREADS parallel)..."
  printf '%s\0' "${TO_COMPRESS[@]}" |
    xargs -0 -n 1 -P "$THREADS" bash -c '
      set -euo pipefail
      src="$1/$3"
      dst="$2/$3.zst"
      mkdir -p -- "$(dirname -- "$dst")"
      zstd -19 -q --force "$src" -o "$dst"
      printf " compressed: %s\n" "$3"
    ' _ "$REPO_ROOT" "$STORE_DIR"
fi

# 7. Prune deleted files
if [[ ${#DELETED[@]} -gt 0 ]]; then
  echo ""
  echo "Pruning ${#DELETED[@]} deleted files..."
  for relpath in "${DELETED[@]}"; do
    zstfile="$STORE_DIR/${relpath}.zst"
    if [[ -f "$zstfile" ]]; then
      rm -- "$zstfile"
      echo " pruned: ${relpath}.zst"
    fi
  done
  # Remove directories left empty by pruning (best effort). -mindepth 1 keeps
  # the store root itself even when it ends up empty.
  find "$STORE_DIR" -mindepth 1 -type d -empty -delete 2>/dev/null || true
fi
# 8. Write manifest (merge: preserve out-of-scope entries)
echo ""
echo "Writing manifest..."
# One timestamp for the whole run instead of one date fork per file.
manifest_ts="$(date -Iseconds)"
{
  # The header is written outside the sort so it always stays at the top,
  # whatever the locale's collation rules are.
  echo "# .lfs-manifest — path, xxh3, compressed_bytes, timestamp"
  echo "# Generated by scripts/data-push.sh — do not edit manually"

  {
    # Preserve out-of-scope entries from old manifest
    if [[ -f "$MANIFEST" ]]; then
      while IFS=$'\t' read -r path hash size ts; do
        [[ "$path" == "#"* || -z "$path" ]] && continue
        in_scope "$path" && continue
        printf '%s\t%s\t%s\t%s\n' "$path" "$hash" "$size" "$ts"
      done < "$MANIFEST"
    fi

    # Write in-scope entries from current hashes
    for relpath in "${!NEW_HASH[@]}"; do
      in_scope "$relpath" || continue
      zstfile="$STORE_DIR/${relpath}.zst"
      if [[ -f "$zstfile" ]]; then
        # GNU stat first, BSD/macOS stat as the fallback.
        size=$(stat -c%s "$zstfile" 2>/dev/null || stat -f%z "$zstfile")
      else
        echo "warning: missing compressed file for $relpath" >&2
        size=0
      fi
      printf '%s\t%s\t%s\t%s\n' "$relpath" "${NEW_HASH[$relpath]}" "$size" "$manifest_ts"
    done
  } | { grep -v '^$' || [[ $? -eq 1 ]]; } | LC_ALL=C sort
  # grep exits 1 when it selects no lines; that is not an error here.
  # Byte-wise sort keeps the entry order identical on every machine.
} > "$TMP_DIR/manifest.new"
mv "$TMP_DIR/manifest.new" "$MANIFEST"

# 9. Git add, commit, push
echo ""
echo "=== Git commit ==="
cd "$REPO_ROOT"
git add .lfs-store/ .lfs-manifest

if git diff --cached --quiet; then
  echo "No git changes to commit."
  exit 0
fi

SUMMARY=""
[[ ${#NEW[@]} -gt 0 ]] && SUMMARY+="${#NEW[@]} new, "
[[ ${#CHANGED[@]} -gt 0 ]] && SUMMARY+="${#CHANGED[@]} changed, "
[[ ${#DELETED[@]} -gt 0 ]] && SUMMARY+="${#DELETED[@]} deleted, "
SUMMARY="${SUMMARY%, }"

# NOTE(review): the original commit-message here-doc, the --no-push guard and
# the upstream check were unreadable in the reviewed source (the text between
# "cat <" and "/dev/null; then" is missing). The lines below are a minimal
# reconstruction; confirm the message wording against the repository history.
git commit -m "data: update LFS store ($SUMMARY)"

if [[ "$NO_PUSH" != true ]]; then
  # Push normally when the branch already tracks an upstream; otherwise
  # create the upstream on origin under the current branch name.
  if git rev-parse --abbrev-ref --symbolic-full-name '@{u}' >/dev/null 2>&1; then
    git push
  else
    git push -u origin "$(git branch --show-current)"
  fi
fi

echo ""
echo "=== Done ==="