#!/usr/bin/env bash
# Compress data/ → .dvc-store/, then DVC add + push.
#
# Working files in data/ stay untouched. DVC tracks compressed copies.
# Counterpart: scripts/data-pull.sh
#
# Usage:
#   ./scripts/data-push.sh             # compress, add, push
#   ./scripts/data-push.sh --dry-run   # show what would be compressed or pruned
set -euo pipefail

# Repository root = parent of the directory holding this script.
# CDPATH is cleared for the cd so that a user-level CDPATH cannot make cd
# print the directory it resolved, which would end up inside REPO_ROOT.
REPO_ROOT="$(CDPATH='' cd -- "$(dirname -- "$0")/.." && pwd)"
DATA_DIR="$REPO_ROOT/data"          # working files (never modified here)
STORE_DIR="$REPO_ROOT/.dvc-store"   # compressed mirror that DVC tracks

# Only "--dry-run" (or no argument) is accepted. Anything else is rejected
# instead of being ignored, so a mistyped flag cannot trigger a real push.
DRY_RUN=false
case "${1:-}" in
  '') ;;
  --dry-run) DRY_RUN=true ;;
  *)
    printf 'error: unknown argument: %s\n' "$1" >&2
    printf 'usage: %s [--dry-run]\n' "${0##*/}" >&2
    exit 2
    ;;
esac

# CPU count for zstd -T: nproc where available, then sysctl hw.ncpu,
# falling back to 4 when neither command works.
THREADS=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)

# Directories to track (everything except raw HTML, bulk metadata, and empty placeholders)
TRACK_DIRS=(
  paragraphs
  annotations
  gold
  dapt-corpus
  analysis
  bench
  pilot
)
#######################################
# Compress every regular file under $DATA_DIR/<dir> into the same relative
# path under $STORE_DIR, with ".zst" appended to the file name.
# Files already named *.zst are not picked up.
# Globals:
#   DATA_DIR  (read) root of the working data tree
#   STORE_DIR (read) root of the compressed mirror
#   DRY_RUN   (read) "true" → only report what would be compressed
#   THREADS   (read) value passed to zstd -T
# Arguments:
#   $1 - directory to process, relative to DATA_DIR
# Outputs:
#   A "skip" line when the directory is missing; one "would compress" line
#   per pending file in dry-run mode.
# Returns:
#   0 on success (a missing directory is not an error); the failing
#   command's status if stat, mkdir or zstd fails.
#######################################
compress_dir() {
  local reldir="$1"
  local srcdir="$DATA_DIR/$reldir"
  local srcfile relpath dstfile srcsize

  if [[ ! -d "$srcdir" ]]; then
    echo " skip $reldir/ (not found)"
    return
  fi

  while IFS= read -r -d '' srcfile; do
    # Quote the prefix so a DATA_DIR containing glob characters is stripped
    # literally rather than being interpreted as a pattern.
    relpath="${srcfile#"$DATA_DIR"/}"
    dstfile="$STORE_DIR/${relpath}.zst"

    # Up to date unless the source is strictly newer than the compressed copy.
    # Testing "dst -nt src" instead would miss the equal-mtime case; zstd is
    # believed to stamp its output with the source's mtime (NOTE(review):
    # confirm for the installed version), in which case every file would be
    # recompressed on every run.
    if [[ -f "$dstfile" && ! "$srcfile" -nt "$dstfile" ]]; then
      continue
    fi

    if $DRY_RUN; then
      # GNU stat first, BSD/macOS stat as the fallback.
      srcsize=$(stat -c%s "$srcfile" 2>/dev/null || stat -f%z "$srcfile") || return
      echo " would compress: $relpath ($(numfmt --to=iec "$srcsize" 2>/dev/null || echo "${srcsize}B"))"
    else
      mkdir -p "$(dirname "$dstfile")" || return
      zstd -19 -T"$THREADS" -q --force "$srcfile" -o "$dstfile" || return
    fi
  done < <(find "$srcdir" -type f -not -name '*.zst' -print0)
}
#######################################
# Remove stale compressed files: any *.zst under $STORE_DIR whose source
# file (same relative path without ".zst") is missing from $DATA_DIR.
# Afterwards, directories left empty inside the store are removed.
# Globals:
#   DATA_DIR  (read) root of the working data tree
#   STORE_DIR (read) root of the compressed mirror
#   DRY_RUN   (read) "true" → report only, delete nothing
# Arguments:
#   None
# Outputs:
#   One "would prune" / "pruned" line per stale file on stdout.
# Returns:
#   0 on success (also when STORE_DIR does not exist); rm's status if a
#   stale file cannot be removed.
#######################################
prune_stale() {
  local zstfile relpath srcfile

  if [[ ! -d "$STORE_DIR" ]]; then
    return
  fi

  while IFS= read -r -d '' zstfile; do
    # Quote the prefix so a STORE_DIR containing glob characters is stripped
    # literally; otherwise the prefix may not match and every file looks stale.
    relpath="${zstfile#"$STORE_DIR"/}"
    relpath="${relpath%.zst}" # strip .zst suffix to get original path
    srcfile="$DATA_DIR/$relpath"

    if [[ -f "$srcfile" ]]; then
      continue
    fi

    if $DRY_RUN; then
      echo " would prune: $relpath.zst (source deleted)"
    else
      rm -- "$zstfile" || return
      echo " pruned: $relpath.zst"
    fi
  done < <(find "$STORE_DIR" -name '*.zst' -type f -print0)

  # Remove empty directories. -mindepth 1 keeps STORE_DIR itself in place even
  # when everything below it was pruned, so it still exists for later steps.
  # Cleanup is best-effort: errors here are deliberately ignored.
  if ! $DRY_RUN; then
    find "$STORE_DIR" -mindepth 1 -type d -empty -delete 2>/dev/null || true
  fi
}
# --- Stage 1: refresh the compressed mirror --------------------------------
printf '%s\n' \
  "=== Compressing data/ → .dvc-store/ ===" \
  "Threads: $THREADS, zstd level: 19" \
  ""

for tracked_dir in "${TRACK_DIRS[@]}"; do
  printf '%s\n' "[$tracked_dir/]"
  compress_dir "$tracked_dir"
done

printf '\n'
printf '%s\n' "Pruning stale files..."
prune_stale

# A dry run ends here, before any DVC command is issued.
if $DRY_RUN; then
  printf '\n'
  printf '%s\n' "(dry run — nothing written)"
  exit 0
fi

# --- Stage 2: hand the mirror to DVC ---------------------------------------
printf '\n'
printf '%s\n' "=== DVC add + push ==="

# ".dvc-store/" below is a relative path, so run from the repository root.
cd "$REPO_ROOT"

# dvc is launched through uvx with the S3 extra, for both add and push.
dvc_cmd=(uvx --with 'dvc[s3]' dvc)
"${dvc_cmd[@]}" add .dvc-store/

printf '\n'
"${dvc_cmd[@]}" push

printf '\n'
printf '%s\n' \
  "=== Done ===" \
  "Commit .dvc-store.dvc and .gitignore if changed:" \
  " git add .dvc-store.dvc .gitignore && git commit -m 'data: update dvc-tracked data'"