From 8e773d533530c9b8d34274854dcfbfd2c8d278e6 Mon Sep 17 00:00:00 2001 From: Joey Eamigh <55670930+JoeyEamigh@users.noreply.github.com> Date: Sun, 29 Mar 2026 01:15:37 -0400 Subject: [PATCH] deployment and minor tweaks --- .dockerignore | 23 ++++++++++ labelapp/Dockerfile | 60 +++++++++++++++++++++++++++ labelapp/app/label/page.tsx | 9 ++-- labelapp/entrypoint.sh | 38 +++++++++++++++++ labelapp/lib/quiz-questions.ts | 8 ++-- labelapp/next.config.ts | 4 +- labelapp/package.json | 5 ++- labelapp/playwright-report/index.html | 2 +- labelapp/scripts/assign.ts | 1 + labelapp/scripts/sample.ts | 2 + labelapp/scripts/seed.ts | 2 + 11 files changed, 142 insertions(+), 12 deletions(-) create mode 100644 .dockerignore create mode 100644 labelapp/Dockerfile create mode 100644 labelapp/entrypoint.sh diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..1fa5fa9 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,23 @@ +# Ignore everything by default +* + +# Allow only what the labelapp Dockerfile needs +!package.json +!bun.lock +!packages/schemas/ + +!ts/package.json +!labelapp/ +labelapp/node_modules/ +labelapp/.next/ +labelapp/.env* +labelapp/playwright-report/ +labelapp/test-results/ + +# Seed data (only the two JSONL files we need) +!data/paragraphs/paragraphs-clean.jsonl +!data/annotations/stage1.jsonl + +# Git/IDE +.git +**/.DS_Store diff --git a/labelapp/Dockerfile b/labelapp/Dockerfile new file mode 100644 index 0000000..ba8edce --- /dev/null +++ b/labelapp/Dockerfile @@ -0,0 +1,60 @@ +# Build context: monorepo root (run: docker build -f labelapp/Dockerfile .) 
+FROM oven/bun:1 AS base + +# -- Install dependencies -- +FROM base AS deps +WORKDIR /app +COPY package.json bun.lock ./ +COPY packages/schemas/package.json packages/schemas/ +COPY ts/package.json ts/ +COPY labelapp/package.json labelapp/ +RUN bun install --frozen-lockfile + +# -- Build Next.js -- +FROM base AS builder +WORKDIR /app +COPY --from=deps /app/node_modules ./node_modules +COPY --from=deps /app/packages/schemas/node_modules ./packages/schemas/node_modules +COPY --from=deps /app/labelapp/node_modules ./labelapp/node_modules +COPY package.json bun.lock ./ +COPY packages/schemas/ packages/schemas/ +COPY labelapp/ labelapp/ +ENV NEXT_TELEMETRY_DISABLED=1 +RUN cd labelapp && bun run build + +# -- Production image -- +FROM base AS runner +WORKDIR /app +ENV NODE_ENV=production +ENV NEXT_TELEMETRY_DISABLED=1 + +# Standalone server + static assets +COPY --from=builder /app/labelapp/.next/standalone ./ +COPY --from=builder /app/labelapp/.next/static ./labelapp/.next/static +COPY --from=builder /app/labelapp/public ./labelapp/public + +# Drizzle migration tooling (drizzle-kit push needs these) +COPY --from=deps /app/node_modules ./node_modules +COPY --from=deps /app/labelapp/node_modules ./labelapp/node_modules +COPY --from=builder /app/labelapp/drizzle.config.ts ./labelapp/ +COPY --from=builder /app/labelapp/db/ ./labelapp/db/ +COPY --from=builder /app/packages/schemas/ ./packages/schemas/ +COPY --from=builder /app/package.json ./ + +# Seed/sample/assign scripts +COPY --from=builder /app/labelapp/scripts/ ./labelapp/scripts/ +COPY --from=builder /app/labelapp/lib/ ./labelapp/lib/ + +# Seed data (paragraphs + stage1 annotations) +COPY data/paragraphs/paragraphs-clean.jsonl /app/data/paragraphs-clean.jsonl +COPY data/annotations/stage1.jsonl /app/data/stage1.jsonl + +# Entrypoint +COPY labelapp/entrypoint.sh /app/entrypoint.sh +RUN chmod +x /app/entrypoint.sh + +EXPOSE 3000 +ENV PORT=3000 +ENV HOSTNAME=0.0.0.0 + +ENTRYPOINT ["/app/entrypoint.sh"] diff --git 
a/labelapp/app/label/page.tsx b/labelapp/app/label/page.tsx index b0b9585..08985ae 100644 --- a/labelapp/app/label/page.tsx +++ b/labelapp/app/label/page.tsx @@ -458,7 +458,7 @@ function CodebookSidebar() { - "Our CISO, Jane Smith" = named person (Firm-Specific). "Our - CISO" alone = function reference (could be Generic). + Is the paragraph about the person (credentials, background, + reporting lines) or the function (program activities, tools)? + Person → Management Role. Function → Risk Management Process. Need 2+ independently verifiable facts (dates, dollar amounts, @@ -495,7 +496,7 @@ function CodebookSidebar() { Choose the category whose content occupies the majority of the - paragraph. If truly 50/50, prefer the more specific category. + paragraph — the primary communicative purpose. diff --git a/labelapp/entrypoint.sh b/labelapp/entrypoint.sh new file mode 100644 index 0000000..e6241c0 --- /dev/null +++ b/labelapp/entrypoint.sh @@ -0,0 +1,38 @@ +#!/bin/bash +set -euo pipefail + +cd /app/labelapp + +echo "==> Running Drizzle migrations..." +bunx drizzle-kit push --force + +echo "==> Checking if database needs seeding..." +ROW_COUNT=$(bun --eval " +import postgres from 'postgres'; +const sql = postgres(process.env.DATABASE_URL); +const [{count}] = await sql\`SELECT count(*)::int as count FROM paragraphs\`; +console.log(count); +await sql.end(); +" 2>/dev/null || echo "0") + +if [ "$ROW_COUNT" = "0" ]; then + export SEED_PARAGRAPHS_PATH=/app/data/paragraphs-clean.jsonl + export SEED_ANNOTATIONS_PATH=/app/data/stage1.jsonl + export SAMPLED_IDS_PATH=/app/labelapp/.sampled-ids.json + + echo "==> Database is empty, seeding..." + bun run scripts/seed.ts + + echo "==> Running sampling..." + bun run scripts/sample.ts + + echo "==> Running assignment generation..." + bun run scripts/assign.ts + + echo "==> Seeding complete." +else + echo "==> Database already seeded ($ROW_COUNT paragraphs). Skipping." +fi + +echo "==> Starting Next.js server..." 
+exec bun run server.js diff --git a/labelapp/lib/quiz-questions.ts b/labelapp/lib/quiz-questions.ts index 0c51c8f..3e74174 100644 --- a/labelapp/lib/quiz-questions.ts +++ b/labelapp/lib/quiz-questions.ts @@ -256,12 +256,12 @@ export const QUIZ_QUESTIONS: QuizQuestion[] = [ id: "qv-1", type: "qv-counting", paragraphText: - "We maintain cyber liability insurance coverage.", + "Our CISO oversees a dedicated cybersecurity team responsible for managing cyber risk across the enterprise.", question: QV_QUESTION, options: QV_OPTIONS, correctAnswer: "3", explanation: - 'This mentions insurance but provides no verifiable details (no dollar amount, no named insurer). "Cyber liability insurance" is a firm-specific fact — it tells you this particular company holds this type of coverage — but there is only one such fact. One firm-specific fact without a named standard = Specificity 3 (Firm-Specific).', + '"CISO" is a cybersecurity-specific title on the codebook\'s IS list — that\'s one firm-specific fact. "Dedicated cybersecurity team" is a generic team reference (NOT list). "Managing cyber risk across the enterprise" is generic. One IS-list fact, no named standards, no QV-eligible facts = Specificity 3 (Firm-Specific).', }, { id: "qv-2", @@ -281,9 +281,9 @@ export const QUIZ_QUESTIONS: QuizQuestion[] = [ "Our incident response team conducts quarterly tabletop exercises.", question: QV_QUESTION, options: QV_OPTIONS, - correctAnswer: "3", + correctAnswer: "1", explanation: - 'Per the codebook, "quarterly" is a generic cadence and does NOT count as a specific fact for QV purposes. However, the mention of an "incident response team" and "tabletop exercises" indicates firm-specific activities. This has one firm-specific element but no hard verifiable facts (no named vendors, no dollar amounts, no exact dates). 
Specificity 3 (Firm-Specific).', + 'Apply the codebook\'s validation step: "quarterly" is a generic cadence (NOT list), "tabletop exercises" is a common practice (NOT list), and "incident response team" is a generic team reference (NOT list). After filtering, no IS-list facts remain. No named standards either. This is Specificity 1 (Generic Boilerplate) — it could appear unchanged in any company\'s filing.', }, { id: "qv-4", diff --git a/labelapp/next.config.ts b/labelapp/next.config.ts index e9ffa30..6ebc36f 100644 --- a/labelapp/next.config.ts +++ b/labelapp/next.config.ts @@ -1,7 +1,9 @@ import type { NextConfig } from "next"; +import path from "node:path"; const nextConfig: NextConfig = { - /* config options here */ + output: "standalone", + outputFileTracingRoot: path.join(import.meta.dirname, "../"), }; export default nextConfig; diff --git a/labelapp/package.json b/labelapp/package.json index 8cbfe3a..faf607a 100644 --- a/labelapp/package.json +++ b/labelapp/package.json @@ -16,7 +16,8 @@ "test": "bun test app/ lib/ && playwright test", "test:api": "bun test app/ lib/", "test:e2e": "playwright test", - "test:e2e:ui": "playwright test --ui" + "test:e2e:ui": "playwright test --ui", + "deploy": "docker build -f Dockerfile -t registry.claiborne.soy/labelapp:latest .. --push" }, "dependencies": { "@base-ui/react": "^1.3.0", @@ -54,4 +55,4 @@ "sharp", "unrs-resolver" ] -} +} \ No newline at end of file diff --git a/labelapp/playwright-report/index.html b/labelapp/playwright-report/index.html index 577ba01..20b2ca2 100644 --- a/labelapp/playwright-report/index.html +++ b/labelapp/playwright-report/index.html @@ -82,4 +82,4 @@ Error generating stack: `+a.message+`
- \ No newline at end of file + \ No newline at end of file diff --git a/labelapp/scripts/assign.ts b/labelapp/scripts/assign.ts index 9719844..32f2aea 100644 --- a/labelapp/scripts/assign.ts +++ b/labelapp/scripts/assign.ts @@ -8,6 +8,7 @@ import * as schema from "../db/schema"; import { generateAssignments, printAssignmentStats } from "../lib/assignment"; const SAMPLED_IDS_PATH = + process.env.SAMPLED_IDS_PATH ?? "/home/joey/Documents/sec-cyBERT/labelapp/.sampled-ids.json"; async function main() { diff --git a/labelapp/scripts/sample.ts b/labelapp/scripts/sample.ts index 300fa99..bb98908 100644 --- a/labelapp/scripts/sample.ts +++ b/labelapp/scripts/sample.ts @@ -27,8 +27,10 @@ interface AnnotationRow { } const OUTPUT_PATH = + process.env.SAMPLED_IDS_PATH ?? "/home/joey/Documents/sec-cyBERT/labelapp/.sampled-ids.json"; const ANNOTATIONS_PATH = + process.env.SEED_ANNOTATIONS_PATH ?? "/home/joey/Documents/sec-cyBERT/data/annotations/stage1.jsonl"; async function main() { diff --git a/labelapp/scripts/seed.ts b/labelapp/scripts/seed.ts index a7c75bf..e757153 100644 --- a/labelapp/scripts/seed.ts +++ b/labelapp/scripts/seed.ts @@ -102,8 +102,10 @@ function computeConsensus(annotations: AnnotationRow[]): { async function main() { const PARAGRAPHS_PATH = + process.env.SEED_PARAGRAPHS_PATH ?? "/home/joey/Documents/sec-cyBERT/data/paragraphs/paragraphs-clean.jsonl"; const ANNOTATIONS_PATH = + process.env.SEED_ANNOTATIONS_PATH ?? "/home/joey/Documents/sec-cyBERT/data/annotations/stage1.jsonl"; // 1. Read annotations and compute consensus per paragraph