infra: migrate from DVC to Git LFS with xxh3 change detection
Replace DVC pipeline with Git LFS on self-hosted Gitea. New scripts use per-file xxh3 hashing for change detection and parallel zstd-19 compression. Supports separate data/checkpoint push modes.
This commit is contained in:
parent
2e932bc327
commit
a5f06f2db7
@ -1,6 +0,0 @@
|
||||
outs:
|
||||
- md5: b52c8929353b5ed374f10aab8c4e7837.dir
|
||||
size: 753948666
|
||||
nfiles: 234
|
||||
hash: md5
|
||||
path: .dvc-store
|
||||
3
.dvc/.gitignore
vendored
3
.dvc/.gitignore
vendored
@ -1,3 +0,0 @@
|
||||
/config.local
|
||||
/tmp
|
||||
/cache
|
||||
@ -1,9 +0,0 @@
|
||||
[core]
|
||||
analytics = false
|
||||
remote = r2
|
||||
['remote "r2"']
|
||||
url = s3://share/sec-cybert
|
||||
endpointurl = https://0a665ba1f35a38354b3f623be13f14bd.r2.cloudflarestorage.com
|
||||
region = auto
|
||||
['remote "public"']
|
||||
url = https://share.lightningcode.dev/sec-cybert
|
||||
@ -1,3 +0,0 @@
|
||||
# Add patterns of files dvc should ignore, which could improve
|
||||
# the performance. Learn more at
|
||||
# https://dvc.org/doc/user-guide/dvcignore
|
||||
2
.gitattributes
vendored
Normal file
2
.gitattributes
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
# LFS-tracked compressed store
|
||||
.lfs-store/**/*.zst filter=lfs diff=lfs merge=lfs -text
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@ -1,8 +1,7 @@
|
||||
# Data (too large for git — managed by DVC)
|
||||
# Data (working copies — compressed copies tracked via Git LFS in .lfs-store/)
|
||||
data/
|
||||
models/
|
||||
checkpoints/
|
||||
.dvc-store/
|
||||
*.tar.zst
|
||||
|
||||
# Dependencies
|
||||
@ -55,5 +54,4 @@ unsloth_compiled_cache/
|
||||
# Finder (MacOS) folder config
|
||||
.DS_Store
|
||||
python/*.whl
|
||||
/.dvc-store
|
||||
|
||||
|
||||
14
CLAUDE.md
14
CLAUDE.md
@ -58,14 +58,22 @@ All commands run from repo root via `bun run <script>`. No need to cd into subpa
|
||||
|--------|-------------|
|
||||
| `py:train` | CLI entrypoint (`uv run main.py` — pass subcommand as arg, e.g. `bun run py:train dapt --config ...`) |
|
||||
|
||||
### Data management (`data:*`)
|
||||
### Data & checkpoints (`data:*`, `ckpt:*`, `sync:*`)
|
||||
|
||||
Large files are compressed with zstd-19 into `.lfs-store/` and tracked via Git LFS on Gitea (`git.claiborne.soy`). Per-file xxh3 hashing ensures only changed files are recompressed/uploaded.
|
||||
|
||||
| Script | What it does |
|
||||
|--------|-------------|
|
||||
| `data:push` | Compress `data/` → `.dvc-store/`, DVC add + push to R2 |
|
||||
| `data:pull` | DVC pull from R2 + decompress into `data/` |
|
||||
| `data:push` | Compress `data/` → `.lfs-store/`, commit + push via LFS (default) |
|
||||
| `data:pull` | Git LFS pull + decompress into `data/` |
|
||||
| `ckpt:push` | Compress checkpoints → `.lfs-store/`, commit + push via LFS |
|
||||
| `ckpt:pull` | Git LFS pull + decompress checkpoints |
|
||||
| `sync:push` | Push both data + checkpoints |
|
||||
| `sync:pull` | Pull both data + checkpoints |
|
||||
| `data:package` | Build standalone `.tar.zst` archives for submission |
|
||||
|
||||
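Checkpoint sync only keeps `model.safetensors`, `config.json`, `tokenizer.json`, `tokenizer_config.json`, `training_args.bin`, and `trainer_state.json` (plus top-level files in `checkpoints/finetune/`); everything else is excluded, including `optimizer.pt`, `scheduler.pt`, `rng_state.pth`, and `.data_cache/` (resume-only state, regenerable).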
|
||||
|
||||
### Cross-package
|
||||
|
||||
| Script | What it does |
|
||||
|
||||
@ -23,7 +23,11 @@
|
||||
"typecheck": "bun run --filter '*' typecheck",
|
||||
"data:push": "./scripts/data-push.sh",
|
||||
"data:pull": "./scripts/data-pull.sh",
|
||||
"data:package": "./scripts/package-data.sh"
|
||||
"data:package": "./scripts/package-data.sh",
|
||||
"ckpt:push": "./scripts/data-push.sh --checkpoints-only",
|
||||
"ckpt:pull": "./scripts/data-pull.sh --checkpoints-only",
|
||||
"sync:push": "./scripts/data-push.sh --all",
|
||||
"sync:pull": "./scripts/data-pull.sh --all"
|
||||
},
|
||||
"workspaces": [
|
||||
"packages/*",
|
||||
|
||||
@ -1,57 +1,145 @@
|
||||
#!/usr/bin/env bash
|
||||
# DVC pull → decompress .dvc-store/ back into data/.
|
||||
# Pull LFS objects and decompress .lfs-store/ → data/ and checkpoints/.
|
||||
# Uses xxh3 manifest for skip logic — only decompresses files that changed.
|
||||
#
|
||||
# Counterpart: scripts/data-push.sh
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/data-pull.sh # pull + decompress all
|
||||
# ./scripts/data-pull.sh --local # decompress only (skip dvc pull, use existing cache)
|
||||
# ./scripts/data-pull.sh # data only (default)
|
||||
# ./scripts/data-pull.sh --checkpoints-only # checkpoints only
|
||||
# ./scripts/data-pull.sh --all # data + checkpoints
|
||||
# ./scripts/data-pull.sh --local # decompress only (skip git pull and git lfs pull)
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
|
||||
DATA_DIR="$REPO_ROOT/data"
|
||||
STORE_DIR="$REPO_ROOT/.dvc-store"
|
||||
SKIP_PULL=false
|
||||
[[ "${1:-}" == "--local" ]] && SKIP_PULL=true
|
||||
|
||||
STORE_DIR="$REPO_ROOT/.lfs-store"
|
||||
MANIFEST="$REPO_ROOT/.lfs-manifest"
|
||||
THREADS=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
|
||||
|
||||
if ! $SKIP_PULL; then
|
||||
echo "=== DVC pull ==="
|
||||
cd "$REPO_ROOT"
|
||||
# Pull from public HTTP remote (no credentials needed)
|
||||
uvx --with 'dvc[s3]' dvc pull -r public
|
||||
echo ""
|
||||
fi
|
||||
# --- Parse flags -----------------------------------------------------------------
|
||||
MODE="data-only"
|
||||
SKIP_PULL=false
|
||||
|
||||
if [[ ! -d "$STORE_DIR" ]]; then
|
||||
echo "Error: .dvc-store/ not found — run dvc pull first or check .dvc-store.dvc exists" >&2
|
||||
exit 1
|
||||
fi
|
||||
for arg in "$@"; do
|
||||
case "$arg" in
|
||||
--data-only) MODE="data-only" ;;
|
||||
--checkpoints-only) MODE="checkpoints-only" ;;
|
||||
--all) MODE="all" ;;
|
||||
--local) SKIP_PULL=true ;;
|
||||
*) echo "Unknown flag: $arg" >&2; exit 1 ;;
|
||||
esac
|
||||
done
|
||||
|
||||
echo "=== Decompressing .dvc-store/ → data/ ==="
|
||||
# --- Helpers ---------------------------------------------------------------------
|
||||
|
||||
# Print the repo-relative path prefix(es) selected by $MODE, one per line.
#   Globals: MODE (read) — data-only | checkpoints-only | all
#   Outputs: "data/" and/or "checkpoints/" on stdout
#   Returns: 0 on success; 1 with a message on stderr for an unrecognized MODE
scope_prefixes() {
  case "$MODE" in
    data-only)
      echo "data/"
      ;;
    checkpoints-only)
      echo "checkpoints/"
      ;;
    all)
      echo "data/"
      echo "checkpoints/"
      ;;
    *)
      # Fail loudly instead of silently reporting an empty scope.
      echo "scope_prefixes: unknown mode: $MODE" >&2
      return 1
      ;;
  esac
}
|
||||
|
||||
# Report whether a repo-relative path falls under one of the prefixes printed
# by scope_prefixes for the current mode.
#   Arguments: $1 - repo-relative path (e.g. "data/gold/labels.jsonl")
#   Returns:   0 if the path starts with an in-scope prefix, 1 otherwise
in_scope() {
  local candidate="$1"
  local scope
  while IFS= read -r scope; do
    # An empty prefix would match every path; never treat it as a scope.
    [[ -n "$scope" ]] || continue
    # The prefix is quoted so it is compared literally; only the trailing *
    # acts as a wildcard.
    if [[ "$candidate" == "$scope"* ]]; then
      return 0
    fi
  done < <(scope_prefixes)
  return 1
}
|
||||
|
||||
# Build the --include argument for `git lfs pull`.
#
# git-lfs takes the include filter as ONE comma-separated list; passing
# --include several times keeps only the last value, which would drop data/
# when both data/ and checkpoints/ are in scope. All patterns are therefore
# joined into a single flag.
#   Outputs: "--include=<pattern>[,<pattern>...]" on stdout, or nothing when
#            scope_prefixes prints no prefix
lfs_include_patterns() {
  local joined=""
  local prefix
  while IFS= read -r prefix; do
    [[ -n "$prefix" ]] || continue
    # ${joined:+,} inserts the separator only after the first pattern.
    joined+="${joined:+,}.lfs-store/${prefix}**"
  done < <(scope_prefixes)

  # With nothing in scope, print nothing so the caller's unquoted expansion
  # contributes no argument at all.
  [[ -n "$joined" ]] || return 0
  echo "--include=${joined}"
}
|
||||
|
||||
# --- Main ------------------------------------------------------------------------
|
||||
|
||||
TMP_DIR=$(mktemp -d)
|
||||
trap 'rm -rf "$TMP_DIR"' EXIT
|
||||
|
||||
echo "=== LFS pull (mode: $MODE) ==="
|
||||
echo "Threads: $THREADS"
|
||||
echo ""
|
||||
|
||||
count=0
|
||||
# 1. Git pull (fetch manifest + LFS pointers)
|
||||
if ! $SKIP_PULL; then
|
||||
echo "Pulling from remote..."
|
||||
cd "$REPO_ROOT"
|
||||
git pull
|
||||
echo ""
|
||||
|
||||
while IFS= read -r -d '' zstfile; do
|
||||
relpath="${zstfile#$STORE_DIR/}"
|
||||
relpath="${relpath%.zst}" # strip .zst to get original relative path
|
||||
dstfile="$DATA_DIR/$relpath"
|
||||
dstdir="$(dirname "$dstfile")"
|
||||
echo "Fetching LFS objects..."
|
||||
# shellcheck disable=SC2046
|
||||
git lfs pull $(lfs_include_patterns)
|
||||
echo ""
|
||||
fi
|
||||
|
||||
# Skip if destination exists and is newer than compressed source
|
||||
if [[ -f "$dstfile" && "$dstfile" -nt "$zstfile" ]]; then
|
||||
# 2. Verify manifest exists
|
||||
if [[ ! -f "$MANIFEST" ]]; then
|
||||
echo "Error: .lfs-manifest not found — run git pull first or check that data has been pushed" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ ! -d "$STORE_DIR" ]]; then
|
||||
echo "Error: .lfs-store/ not found — run git lfs pull first" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 3. Read manifest, find files needing decompression
|
||||
echo "Checking which files need decompression..."
|
||||
TO_DECOMPRESS=()
|
||||
|
||||
while IFS=$'\t' read -r relpath hash size ts; do
|
||||
[[ "$relpath" == "#"* || -z "$relpath" ]] && continue
|
||||
in_scope "$relpath" || continue
|
||||
|
||||
zstfile="$STORE_DIR/${relpath}.zst"
|
||||
if [[ ! -f "$zstfile" ]]; then
|
||||
echo " warning: missing compressed file: ${relpath}.zst" >&2
|
||||
continue
|
||||
fi
|
||||
|
||||
mkdir -p "$dstdir"
|
||||
zstd -d -T"$THREADS" -q --force "$zstfile" -o "$dstfile"
|
||||
count=$((count + 1))
|
||||
done < <(find "$STORE_DIR" -name '*.zst' -type f -print0)
|
||||
dst="$REPO_ROOT/$relpath"
|
||||
|
||||
# Skip if destination exists and hash matches
|
||||
if [[ -f "$dst" ]]; then
|
||||
current_hash=$(xxh3sum "$dst" | awk '{print $1}')
|
||||
current_hash="${current_hash#XXH3_}"
|
||||
if [[ "$current_hash" == "$hash" ]]; then
|
||||
continue
|
||||
fi
|
||||
fi
|
||||
|
||||
TO_DECOMPRESS+=("$relpath")
|
||||
done < "$MANIFEST"
|
||||
|
||||
echo " ${#TO_DECOMPRESS[@]} files to decompress"
|
||||
|
||||
if [[ ${#TO_DECOMPRESS[@]} -eq 0 ]]; then
|
||||
echo ""
|
||||
echo "Everything up to date."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# 4. Decompress in parallel
|
||||
echo ""
|
||||
echo "Decompressing..."
|
||||
printf '%s\n' "${TO_DECOMPRESS[@]}" | xargs -P "$THREADS" -I{} bash -c '
|
||||
src="'"$STORE_DIR"'/{}.zst"
|
||||
dst="'"$REPO_ROOT"'/{}"
|
||||
mkdir -p "$(dirname "$dst")"
|
||||
zstd -d -q --force "$src" -o "$dst"
|
||||
echo " decompressed: {}"
|
||||
'
|
||||
|
||||
echo "Decompressed $count files into data/"
|
||||
echo ""
|
||||
echo "=== Done ==="
|
||||
echo "Decompressed ${#TO_DECOMPRESS[@]} files"
|
||||
|
||||
@ -1,119 +1,292 @@
|
||||
#!/usr/bin/env bash
|
||||
# Compress data/ → .dvc-store/, then DVC add + push.
|
||||
# Compress data/ and checkpoints/ → .lfs-store/, track via Git LFS.
|
||||
# Uses xxh3 content hashing for per-file change detection.
|
||||
#
|
||||
# Working files in data/ stay untouched. DVC tracks compressed copies.
|
||||
# Counterpart: scripts/data-pull.sh
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/data-push.sh # compress, add, push
|
||||
# ./scripts/data-push.sh --dry-run # show what would be compressed
|
||||
# ./scripts/data-push.sh # data only (default)
|
||||
# ./scripts/data-push.sh --checkpoints-only # checkpoints only
|
||||
# ./scripts/data-push.sh --all # data + checkpoints
|
||||
# ./scripts/data-push.sh --dry-run # show what would change
|
||||
# ./scripts/data-push.sh --no-push # commit but don't push
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
|
||||
DATA_DIR="$REPO_ROOT/data"
|
||||
STORE_DIR="$REPO_ROOT/.dvc-store"
|
||||
DRY_RUN=false
|
||||
[[ "${1:-}" == "--dry-run" ]] && DRY_RUN=true
|
||||
|
||||
STORE_DIR="$REPO_ROOT/.lfs-store"
|
||||
MANIFEST="$REPO_ROOT/.lfs-manifest"
|
||||
THREADS=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
|
||||
|
||||
# Directories to track (everything except raw HTML, bulk metadata, and empty placeholders)
|
||||
TRACK_DIRS=(
|
||||
paragraphs
|
||||
annotations
|
||||
gold
|
||||
dapt-corpus
|
||||
analysis
|
||||
bench
|
||||
pilot
|
||||
# --- Tracked data directories ---------------------------------------------------
|
||||
DATA_DIRS=(
|
||||
data/paragraphs
|
||||
data/annotations
|
||||
data/gold
|
||||
data/dapt-corpus
|
||||
data/analysis
|
||||
data/bench
|
||||
data/pilot
|
||||
)
|
||||
|
||||
compress_dir() {
|
||||
local reldir="$1"
|
||||
local srcdir="$DATA_DIR/$reldir"
|
||||
local dstdir="$STORE_DIR/$reldir"
|
||||
# --- Checkpoint include/exclude -------------------------------------------------
|
||||
# Only these filenames are kept from checkpoint trees
|
||||
CKPT_KEEP_NAMES=(
|
||||
model.safetensors
|
||||
config.json
|
||||
tokenizer.json
|
||||
tokenizer_config.json
|
||||
training_args.bin
|
||||
trainer_state.json
|
||||
)
|
||||
|
||||
if [[ ! -d "$srcdir" ]]; then
|
||||
echo " skip $reldir/ (not found)"
|
||||
return
|
||||
# --- Parse flags -----------------------------------------------------------------
|
||||
MODE="data-only"
|
||||
DRY_RUN=false
|
||||
NO_PUSH=false
|
||||
|
||||
for arg in "$@"; do
|
||||
case "$arg" in
|
||||
--data-only) MODE="data-only" ;;
|
||||
--checkpoints-only) MODE="checkpoints-only" ;;
|
||||
--all) MODE="all" ;;
|
||||
--dry-run) DRY_RUN=true ;;
|
||||
--no-push) NO_PUSH=true ;;
|
||||
*) echo "Unknown flag: $arg" >&2; exit 1 ;;
|
||||
esac
|
||||
done
|
||||
|
||||
# --- Helpers ---------------------------------------------------------------------
|
||||
|
||||
# Collect source files for the current mode, one per line (absolute paths).
|
||||
collect_files() {
|
||||
if [[ "$MODE" != "checkpoints-only" ]]; then
|
||||
for dir in "${DATA_DIRS[@]}"; do
|
||||
local abs="$REPO_ROOT/$dir"
|
||||
[[ -d "$abs" ]] && find "$abs" -type f -not -name '*.zst'
|
||||
done
|
||||
fi
|
||||
|
||||
# Find all files (not dirs) in the source
|
||||
while IFS= read -r -d '' srcfile; do
|
||||
local relpath="${srcfile#$DATA_DIR/}"
|
||||
local dstfile="$STORE_DIR/${relpath}.zst"
|
||||
local dstdir_for_file="$(dirname "$dstfile")"
|
||||
if [[ "$MODE" != "data-only" ]]; then
|
||||
# Tracked checkpoint files (by name whitelist)
|
||||
local name_args=()
|
||||
for n in "${CKPT_KEEP_NAMES[@]}"; do
|
||||
name_args+=(-name "$n" -o)
|
||||
done
|
||||
# Remove trailing -o
|
||||
unset 'name_args[-1]'
|
||||
|
||||
# Skip if compressed version exists and is newer than source
|
||||
if [[ -f "$dstfile" && "$dstfile" -nt "$srcfile" ]]; then
|
||||
continue
|
||||
fi
|
||||
find "$REPO_ROOT/checkpoints" -type f \
|
||||
-not -path "*/.data_cache/*" \
|
||||
\( "${name_args[@]}" \) 2>/dev/null || true
|
||||
|
||||
if $DRY_RUN; then
|
||||
local srcsize=$(stat -c%s "$srcfile" 2>/dev/null || stat -f%z "$srcfile")
|
||||
echo " would compress: $relpath ($(numfmt --to=iec "$srcsize" 2>/dev/null || echo "${srcsize}B"))"
|
||||
else
|
||||
mkdir -p "$dstdir_for_file"
|
||||
zstd -19 -T"$THREADS" -q --force "$srcfile" -o "$dstfile"
|
||||
fi
|
||||
done < <(find "$srcdir" -type f -not -name '*.zst' -print0)
|
||||
}
|
||||
|
||||
# Remove stale compressed files whose source no longer exists
|
||||
prune_stale() {
|
||||
if [[ ! -d "$STORE_DIR" ]]; then return; fi
|
||||
|
||||
while IFS= read -r -d '' zstfile; do
|
||||
local relpath="${zstfile#$STORE_DIR/}"
|
||||
relpath="${relpath%.zst}" # strip .zst suffix to get original path
|
||||
local srcfile="$DATA_DIR/$relpath"
|
||||
|
||||
if [[ ! -f "$srcfile" ]]; then
|
||||
if $DRY_RUN; then
|
||||
echo " would prune: $relpath.zst (source deleted)"
|
||||
else
|
||||
rm "$zstfile"
|
||||
echo " pruned: $relpath.zst"
|
||||
fi
|
||||
fi
|
||||
done < <(find "$STORE_DIR" -name '*.zst' -type f -print0)
|
||||
|
||||
# Remove empty directories
|
||||
if ! $DRY_RUN; then
|
||||
find "$STORE_DIR" -type d -empty -delete 2>/dev/null || true
|
||||
# Top-level log/json files in finetune/
|
||||
find "$REPO_ROOT/checkpoints/finetune" -maxdepth 1 -type f 2>/dev/null || true
|
||||
fi
|
||||
}
|
||||
|
||||
echo "=== Compressing data/ → .dvc-store/ ==="
|
||||
# Scope prefix(es) for the current mode — used to filter manifest entries.
#   Globals: MODE (read) — data-only | checkpoints-only | all
#   Outputs: "data/" and/or "checkpoints/" on stdout, one per line
#   Returns: 0 on success; 1 with a message on stderr for an unrecognized MODE
scope_prefixes() {
  case "$MODE" in
    data-only)
      echo "data/"
      ;;
    checkpoints-only)
      echo "checkpoints/"
      ;;
    all)
      echo "data/"
      echo "checkpoints/"
      ;;
    *)
      # Fail loudly instead of silently reporting an empty scope.
      echo "scope_prefixes: unknown mode: $MODE" >&2
      return 1
      ;;
  esac
}
|
||||
|
||||
# Check if a path is in scope for the current mode.
#   Arguments: $1 - repo-relative path (e.g. "data/gold/labels.jsonl")
#   Returns:   0 if the path starts with a prefix printed by scope_prefixes,
#              1 otherwise
in_scope() {
  local candidate="$1"
  local scope
  while IFS= read -r scope; do
    # An empty prefix would match every path; never treat it as a scope.
    [[ -n "$scope" ]] || continue
    # The prefix is quoted so it is compared literally; only the trailing *
    # acts as a wildcard.
    if [[ "$candidate" == "$scope"* ]]; then
      return 0
    fi
  done < <(scope_prefixes)
  return 1
}
|
||||
|
||||
# --- Main ------------------------------------------------------------------------
|
||||
|
||||
TMP_DIR=$(mktemp -d)
|
||||
trap 'rm -rf "$TMP_DIR"' EXIT
|
||||
|
||||
echo "=== LFS push (mode: $MODE) ==="
|
||||
echo "Threads: $THREADS, zstd level: 19"
|
||||
echo ""
|
||||
|
||||
for dir in "${TRACK_DIRS[@]}"; do
|
||||
echo "[$dir/]"
|
||||
compress_dir "$dir"
|
||||
# 1. Collect source files
|
||||
echo "Collecting source files..."
|
||||
collect_files > "$TMP_DIR/filelist.txt"
|
||||
FILE_COUNT=$(wc -l < "$TMP_DIR/filelist.txt")
|
||||
echo " found $FILE_COUNT files"
|
||||
|
||||
if [[ "$FILE_COUNT" -eq 0 ]]; then
|
||||
echo "No files to process."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# 2. Hash all source files with xxh3
|
||||
echo "Hashing with xxh3..."
|
||||
xxh3sum --filelist "$TMP_DIR/filelist.txt" > "$TMP_DIR/hashes.txt"
|
||||
echo " done"
|
||||
|
||||
# 3. Load old manifest into associative arrays
|
||||
declare -A OLD_HASH OLD_SIZE
|
||||
if [[ -f "$MANIFEST" ]]; then
|
||||
while IFS=$'\t' read -r path hash size ts; do
|
||||
[[ "$path" == "#"* || -z "$path" ]] && continue
|
||||
OLD_HASH["$path"]="$hash"
|
||||
OLD_SIZE["$path"]="$size"
|
||||
done < "$MANIFEST"
|
||||
fi
|
||||
|
||||
# 4. Diff: find changed, new, unchanged files
|
||||
declare -A NEW_HASH
|
||||
CHANGED=()
|
||||
NEW=()
|
||||
UNCHANGED=0
|
||||
|
||||
while IFS=' ' read -r hash filepath; do
|
||||
# xxh3sum prefixes hash with "XXH3_" — strip it
|
||||
hash="${hash#XXH3_}"
|
||||
relpath="${filepath#$REPO_ROOT/}"
|
||||
NEW_HASH["$relpath"]="$hash"
|
||||
|
||||
old="${OLD_HASH[$relpath]:-}"
|
||||
if [[ -z "$old" ]]; then
|
||||
NEW+=("$relpath")
|
||||
elif [[ "$old" != "$hash" ]]; then
|
||||
CHANGED+=("$relpath")
|
||||
else
|
||||
UNCHANGED=$((UNCHANGED + 1))
|
||||
fi
|
||||
done < "$TMP_DIR/hashes.txt"
|
||||
|
||||
# 5. Find deletions (in old manifest + in scope, but not on disk)
|
||||
DELETED=()
|
||||
for path in "${!OLD_HASH[@]}"; do
|
||||
in_scope "$path" || continue
|
||||
[[ -z "${NEW_HASH[$path]:-}" ]] && DELETED+=("$path")
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "Pruning stale files..."
|
||||
prune_stale
|
||||
echo " new: ${#NEW[@]}"
|
||||
echo " changed: ${#CHANGED[@]}"
|
||||
echo " unchanged: $UNCHANGED"
|
||||
echo " deleted: ${#DELETED[@]}"
|
||||
|
||||
TO_COMPRESS=("${NEW[@]}" "${CHANGED[@]}")
|
||||
|
||||
if [[ ${#TO_COMPRESS[@]} -eq 0 && ${#DELETED[@]} -eq 0 ]]; then
|
||||
echo ""
|
||||
echo "Nothing to do."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if $DRY_RUN; then
|
||||
echo ""
|
||||
if [[ ${#TO_COMPRESS[@]} -gt 0 ]]; then
|
||||
echo "Would compress:"
|
||||
printf ' %s\n' "${TO_COMPRESS[@]}"
|
||||
fi
|
||||
if [[ ${#DELETED[@]} -gt 0 ]]; then
|
||||
echo "Would prune:"
|
||||
printf ' %s\n' "${DELETED[@]}"
|
||||
fi
|
||||
echo ""
|
||||
echo "(dry run — nothing written)"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "=== DVC add + push ==="
|
||||
# 6. Compress changed + new files in parallel
|
||||
if [[ ${#TO_COMPRESS[@]} -gt 0 ]]; then
|
||||
echo ""
|
||||
echo "Compressing ${#TO_COMPRESS[@]} files (zstd -19, $THREADS parallel)..."
|
||||
printf '%s\n' "${TO_COMPRESS[@]}" | xargs -P "$THREADS" -I{} bash -c '
|
||||
src="'"$REPO_ROOT"'/{}"
|
||||
dst="'"$STORE_DIR"'/{}.zst"
|
||||
mkdir -p "$(dirname "$dst")"
|
||||
zstd -19 -q --force "$src" -o "$dst"
|
||||
echo " compressed: {}"
|
||||
'
|
||||
fi
|
||||
|
||||
# 7. Prune deleted files
|
||||
if [[ ${#DELETED[@]} -gt 0 ]]; then
|
||||
echo ""
|
||||
echo "Pruning ${#DELETED[@]} deleted files..."
|
||||
for relpath in "${DELETED[@]}"; do
|
||||
zstfile="$STORE_DIR/${relpath}.zst"
|
||||
if [[ -f "$zstfile" ]]; then
|
||||
rm "$zstfile"
|
||||
echo " pruned: ${relpath}.zst"
|
||||
fi
|
||||
done
|
||||
find "$STORE_DIR" -type d -empty -delete 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# 8. Write manifest (merge: preserve out-of-scope entries)
|
||||
echo ""
|
||||
echo "Writing manifest..."
|
||||
{
|
||||
echo "# .lfs-manifest — path, xxh3, compressed_bytes, timestamp"
|
||||
echo "# Generated by scripts/data-push.sh — do not edit manually"
|
||||
|
||||
# Preserve out-of-scope entries from old manifest
|
||||
if [[ -f "$MANIFEST" ]]; then
|
||||
while IFS=$'\t' read -r path hash size ts; do
|
||||
[[ "$path" == "#"* || -z "$path" ]] && continue
|
||||
in_scope "$path" && continue
|
||||
printf '%s\t%s\t%s\t%s\n' "$path" "$hash" "$size" "$ts"
|
||||
done < "$MANIFEST"
|
||||
fi
|
||||
|
||||
# Write in-scope entries from current hashes
|
||||
for relpath in "${!NEW_HASH[@]}"; do
|
||||
in_scope "$relpath" || continue
|
||||
zstfile="$STORE_DIR/${relpath}.zst"
|
||||
if [[ -f "$zstfile" ]]; then
|
||||
size=$(stat -c%s "$zstfile" 2>/dev/null || stat -f%z "$zstfile")
|
||||
else
|
||||
size=0
|
||||
fi
|
||||
printf '%s\t%s\t%s\t%s\n' "$relpath" "${NEW_HASH[$relpath]}" "$size" "$(date -Iseconds)"
|
||||
done
|
||||
} | grep -v '^$' | sort > "$TMP_DIR/manifest.new"
|
||||
mv "$TMP_DIR/manifest.new" "$MANIFEST"
|
||||
|
||||
# 9. Git add, commit, push
|
||||
echo ""
|
||||
echo "=== Git commit ==="
|
||||
cd "$REPO_ROOT"
|
||||
uvx --with 'dvc[s3]' dvc add .dvc-store/
|
||||
git add .lfs-store/ .lfs-manifest
|
||||
|
||||
echo ""
|
||||
uvx --with 'dvc[s3]' dvc push
|
||||
if git diff --cached --quiet; then
|
||||
echo "No git changes to commit."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
SUMMARY=""
|
||||
[[ ${#NEW[@]} -gt 0 ]] && SUMMARY+="${#NEW[@]} new, "
|
||||
[[ ${#CHANGED[@]} -gt 0 ]] && SUMMARY+="${#CHANGED[@]} changed, "
|
||||
[[ ${#DELETED[@]} -gt 0 ]] && SUMMARY+="${#DELETED[@]} deleted, "
|
||||
SUMMARY="${SUMMARY%, }"
|
||||
|
||||
git commit -m "$(cat <<EOF
|
||||
data: ${SUMMARY}
|
||||
EOF
|
||||
)"
|
||||
|
||||
if $NO_PUSH; then
|
||||
echo "Committed. Skipping push (--no-push)."
|
||||
else
|
||||
echo ""
|
||||
echo "=== Git push ==="
|
||||
if git rev-parse --verify "@{u}" &>/dev/null; then
|
||||
git push
|
||||
else
|
||||
git push -u origin "$(git branch --show-current)"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "=== Done ==="
|
||||
echo "Commit .dvc-store.dvc and .gitignore if changed:"
|
||||
echo " git add .dvc-store.dvc .gitignore && git commit -m 'data: update dvc-tracked data'"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user