From a5f06f2db77ddb0b94218181424ec77b9b2e17a7 Mon Sep 17 00:00:00 2001 From: Joey Eamigh <55670930+JoeyEamigh@users.noreply.github.com> Date: Sun, 5 Apr 2026 16:21:14 -0400 Subject: [PATCH] infra: migrate from DVC to Git LFS with xxh3 change detection Replace DVC pipeline with Git LFS on self-hosted Gitea. New scripts use per-file xxh3 hashing for change detection and parallel zstd-19 compression. Supports separate data/checkpoint push modes. --- .dvc-store.dvc | 6 - .dvc/.gitignore | 3 - .dvc/config | 9 -- .dvcignore | 3 - .gitattributes | 2 + .gitignore | 4 +- CLAUDE.md | 14 +- package.json | 6 +- scripts/data-pull.sh | 154 +++++++++++++++----- scripts/data-push.sh | 331 ++++++++++++++++++++++++++++++++----------- 10 files changed, 392 insertions(+), 140 deletions(-) delete mode 100644 .dvc-store.dvc delete mode 100644 .dvc/.gitignore delete mode 100644 .dvc/config delete mode 100644 .dvcignore create mode 100644 .gitattributes diff --git a/.dvc-store.dvc b/.dvc-store.dvc deleted file mode 100644 index 37cbd50..0000000 --- a/.dvc-store.dvc +++ /dev/null @@ -1,6 +0,0 @@ -outs: -- md5: b52c8929353b5ed374f10aab8c4e7837.dir - size: 753948666 - nfiles: 234 - hash: md5 - path: .dvc-store diff --git a/.dvc/.gitignore b/.dvc/.gitignore deleted file mode 100644 index 528f30c..0000000 --- a/.dvc/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -/config.local -/tmp -/cache diff --git a/.dvc/config b/.dvc/config deleted file mode 100644 index 21df50b..0000000 --- a/.dvc/config +++ /dev/null @@ -1,9 +0,0 @@ -[core] - analytics = false - remote = r2 -['remote "r2"'] - url = s3://share/sec-cybert - endpointurl = https://0a665ba1f35a38354b3f623be13f14bd.r2.cloudflarestorage.com - region = auto -['remote "public"'] - url = https://share.lightningcode.dev/sec-cybert diff --git a/.dvcignore b/.dvcignore deleted file mode 100644 index 5197305..0000000 --- a/.dvcignore +++ /dev/null @@ -1,3 +0,0 @@ -# Add patterns of files dvc should ignore, which could improve -# the performance. Learn more at -# https://dvc.org/doc/user-guide/dvcignore diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..641ea14 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# LFS-tracked compressed store +.lfs-store/**/*.zst filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index f561072..265a5cd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,7 @@ -# Data (too large for git — managed by DVC) +# Data (working copies — compressed copies tracked via Git LFS in .lfs-store/) data/ models/ checkpoints/ -.dvc-store/ *.tar.zst # Dependencies @@ -55,5 +54,4 @@ unsloth_compiled_cache/ # Finder (MacOS) folder config .DS_Store python/*.whl -/.dvc-store diff --git a/CLAUDE.md b/CLAUDE.md index 31ff78b..fc05bc8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -58,14 +58,22 @@ All commands run from repo root via `bun run