From c0273c9e2ed5cd09eb65a74a9bc43f7a9abcc7b5 Mon Sep 17 00:00:00 2001 From: Joey Eamigh <55670930+JoeyEamigh@users.noreply.github.com> Date: Mon, 30 Mar 2026 16:53:35 -0400 Subject: [PATCH] adding dvc backend so data can be cleanly pulled --- .dvc-store.dvc | 6 ++ .dvc/.gitignore | 3 + .dvc/config | 9 +++ .dvcignore | 3 + .env.example | 10 +++ .gitignore | 5 +- CLAUDE.md | 8 +++ README.md | 149 ++++++++++++++++++++++++++++++++++++++++ package.json | 5 +- python/pyproject.toml | 3 + scripts/data-pull.sh | 57 +++++++++++++++ scripts/data-push.sh | 119 ++++++++++++++++++++++++++++++++ scripts/package-data.sh | 85 +++++++++++++++++++++++ 13 files changed, 460 insertions(+), 2 deletions(-) create mode 100644 .dvc-store.dvc create mode 100644 .dvc/.gitignore create mode 100644 .dvc/config create mode 100644 .dvcignore create mode 100644 .env.example create mode 100644 README.md create mode 100755 scripts/data-pull.sh create mode 100755 scripts/data-push.sh create mode 100755 scripts/package-data.sh diff --git a/.dvc-store.dvc b/.dvc-store.dvc new file mode 100644 index 0000000..beaedb2 --- /dev/null +++ b/.dvc-store.dvc @@ -0,0 +1,6 @@ +outs: +- md5: c633654a20f23d76af34689f7e27d58a.dir + size: 729964105 + nfiles: 111 + hash: md5 + path: .dvc-store diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..21df50b --- /dev/null +++ b/.dvc/config @@ -0,0 +1,9 @@ +[core] + analytics = false + remote = r2 +['remote "r2"'] + url = s3://share/sec-cybert + endpointurl = https://0a665ba1f35a38354b3f623be13f14bd.r2.cloudflarestorage.com + region = auto +['remote "public"'] + url = https://share.lightningcode.dev/sec-cybert diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000..5197305 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..731f10e --- /dev/null +++ b/.env.example @@ -0,0 +1,10 @@ +# OpenRouter (GenAI labeling pipeline) +OPENROUTER_API_KEY="" + +# Cloudflare R2 (DVC data storage) +R2_BUCKET="share" +R2_ENDPOINT="https://0a665ba1f35a38354b3f623be13f14bd.r2.cloudflarestorage.com" +R2_PUBLIC_URL="https://share.lightningcode.dev" +R2_API_TOKEN="" +R2_ACCESS_KEY_ID="" +R2_SECRET_ACCESS_KEY="" diff --git a/.gitignore b/.gitignore index 425ee1a..97532fb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,9 @@ -# Data (too large for git) +# Data (too large for git — managed by DVC) data/ models/ checkpoints/ +.dvc-store/ +*.tar.zst # Dependencies ts/node_modules/ @@ -52,3 +54,4 @@ report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json # Finder (MacOS) folder config .DS_Store python/*.whl +/.dvc-store diff --git a/CLAUDE.md b/CLAUDE.md index a35aeb3..78379c2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -55,6 +55,14 @@ All commands run from repo root via `bun run