deployment and minor tweaks

2026-03-29 01:15:37 -04:00 · 2026-03-29 01:15:37 -04:00 · 8e773d5335
commit 8e773d5335
parent 8c496ededa
11 changed files with 142 additions and 12 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -0,0 +1,23 @@
 # Allow-list for the labelapp image build (build context: monorepo root).
 # Rule order matters: when several patterns match a path, the last one wins.
 # Ignore everything by default
 *
 # Allow only what the labelapp Dockerfile needs
 !package.json
 !bun.lock
 !packages/schemas/
 # Manifest only: the Dockerfile copies ts/package.json before `bun install`.
 !ts/package.json
 !labelapp/
 # Re-exclude local installs, build output, env files and test artifacts
 # that live inside the re-included labelapp/ directory.
 labelapp/node_modules/
 labelapp/.next/
 labelapp/.env*
 labelapp/playwright-report/
 labelapp/test-results/
 # Seed data (only the two JSONL files we need)
 !data/paragraphs/paragraphs-clean.jsonl
 !data/annotations/stage1.jsonl
 # Git/IDE
 # `.git` is already matched by `*` above; `**/.DS_Store` additionally
 # filters such files out of the re-included directories.
 .git
 **/.DS_Store
--- a/labelapp/Dockerfile
+++ b/labelapp/Dockerfile
@ -0,0 +1,60 @@
 # Build context: monorepo root (run: docker build -f labelapp/Dockerfile .)
 # Shared base image; every stage below starts from it.
 FROM oven/bun:1 AS base
 # -- Install dependencies --
 # Only the lockfile and the package manifests are copied into this stage, so
 # its inputs change only when a manifest or bun.lock changes.
 FROM base AS deps
 WORKDIR /app
 COPY package.json bun.lock ./
 COPY packages/schemas/package.json packages/schemas/
 # NOTE(review): presumably required because ts/ is a workspace member listed
 # in the root package.json — confirm; no ts/ sources are copied anywhere.
 COPY ts/package.json ts/
 COPY labelapp/package.json labelapp/
 # --frozen-lockfile: install exactly what bun.lock records and fail instead
 # of rewriting the lockfile.
 RUN bun install --frozen-lockfile
 # -- Build Next.js --
 # Combines the dependencies installed in the deps stage with the real sources
 # of packages/schemas and labelapp, then runs labelapp's build script.
 FROM base AS builder
 WORKDIR /app
 COPY --from=deps /app/node_modules ./node_modules
 # NOTE(review): these two COPYs fail the build if the corresponding
 # per-package node_modules directory was not created by `bun install`
 # (e.g. everything hoisted to the root) — confirm both exist.
 COPY --from=deps /app/packages/schemas/node_modules ./packages/schemas/node_modules
 COPY --from=deps /app/labelapp/node_modules ./labelapp/node_modules
 COPY package.json bun.lock ./
 COPY packages/schemas/ packages/schemas/
 COPY labelapp/ labelapp/
 ENV NEXT_TELEMETRY_DISABLED=1
 # Runs the `build` script from labelapp/package.json; the runner stage
 # expects it to produce labelapp/.next/standalone and labelapp/.next/static.
 RUN cd labelapp && bun run build
 # -- Production image --
 # Final stage: starts from the base image again and receives only the
 # artifacts copied below.
 FROM base AS runner
 WORKDIR /app
 ENV NODE_ENV=production
 ENV NEXT_TELEMETRY_DISABLED=1
 # Standalone server + static assets
 # The *contents* of .next/standalone are copied to /app, and the static and
 # public assets to /app/labelapp/....
 # NOTE(review): this layout implies the generated server entry is
 # /app/labelapp/server.js — confirm entrypoint.sh launches that path.
 COPY --from=builder /app/labelapp/.next/standalone ./
 COPY --from=builder /app/labelapp/.next/static ./labelapp/.next/static
 COPY --from=builder /app/labelapp/public ./labelapp/public
 # Drizzle migration tooling (drizzle-kit push needs these)
 # The full install from the deps stage is copied on top of whatever
 # node_modules the standalone output already placed at these paths.
 COPY --from=deps /app/node_modules ./node_modules
 COPY --from=deps /app/labelapp/node_modules ./labelapp/node_modules
 COPY --from=builder /app/labelapp/drizzle.config.ts ./labelapp/
 COPY --from=builder /app/labelapp/db/ ./labelapp/db/
 COPY --from=builder /app/packages/schemas/ ./packages/schemas/
 COPY --from=builder /app/package.json ./
 # Seed/sample/assign scripts
 COPY --from=builder /app/labelapp/scripts/ ./labelapp/scripts/
 COPY --from=builder /app/labelapp/lib/ ./labelapp/lib/
 # Seed data (paragraphs + stage1 annotations)
 # Both files are placed directly under /app/data (no paragraphs/ or
 # annotations/ subdirectory); entrypoint.sh exports these exact paths.
 COPY data/paragraphs/paragraphs-clean.jsonl /app/data/paragraphs-clean.jsonl
 COPY data/annotations/stage1.jsonl /app/data/stage1.jsonl
 # Entrypoint
 COPY labelapp/entrypoint.sh /app/entrypoint.sh
 RUN chmod +x /app/entrypoint.sh
 EXPOSE 3000
 # NOTE(review): PORT and HOSTNAME are presumably read by the standalone
 # server at startup (listen on 0.0.0.0:3000) — confirm.
 ENV PORT=3000
 ENV HOSTNAME=0.0.0.0
 ENTRYPOINT ["/app/entrypoint.sh"]
--- a/labelapp/app/label/page.tsx
+++ b/labelapp/app/label/page.tsx
@ -458,7 +458,7 @@ function CodebookSidebar() {
                <SpecDef
                  level={3}
                  name="Firm-Specific"
-                  desc="Contains details unique to this company: named personnel, specific org structure, named tools/vendors, described processes."
+                  desc="Contains at least one fact from the IS list unique to this company: cybersecurity-specific titles (CISO, CTO), named tools/vendors, specific dates, named committees."
                />
                <SpecDef
                  level={4}
@ -481,8 +481,9 @@ function CodebookSidebar() {
                  Governance. Named officer/team = Management Role.
                </Rule>
                <Rule title="Person vs Function">
-                  "Our CISO, Jane Smith" = named person (Firm-Specific). "Our
+                  Is the paragraph about the person (credentials, background,
-                  CISO" alone = function reference (could be Generic).
+                  reporting lines) or the function (program activities, tools)?
                  Person → Management Role. Function → Risk Management Process.
                </Rule>
                <Rule title="QV threshold">
                  Need 2+ independently verifiable facts (dates, dollar amounts,
@ -495,7 +496,7 @@ function CodebookSidebar() {
                </Rule>
                <Rule title="Dual-topic paragraphs">
                  Choose the category whose content occupies the majority of the
-                  paragraph. If truly 50/50, prefer the more specific category.
+                  paragraph — the primary communicative purpose.
                </Rule>
              </div>
            </section>
--- a/labelapp/entrypoint.sh
+++ b/labelapp/entrypoint.sh
@ -0,0 +1,38 @@
 #!/bin/bash
 # Container entrypoint for labelapp.
 #
 #   1. Push the Drizzle schema to the database.
 #   2. If the `paragraphs` table holds no rows, seed it and then run the
 #      sampling and assignment scripts.
 #   3. Replace this shell with the Next.js standalone server.
 #
 # Environment: DATABASE_URL is read by the row-count probe below.
 # Any failing command aborts the script (errexit, nounset, pipefail).
 set -euo pipefail
 cd /app/labelapp
 echo "==> Running Drizzle migrations..."
 # NOTE(review): --force presumably auto-approves statements drizzle-kit would
 # otherwise prompt for (including destructive ones) — confirm this is wanted
 # on every container start.
 bunx drizzle-kit push --force
 echo "==> Checking if database needs seeding..."
 # The probe prints the row count of `paragraphs`. Its stderr is discarded and
 # any failure is reported as "0", so a failed probe takes the seeding branch.
 ROW_COUNT=$(bun --eval "
 import postgres from 'postgres';
 const sql = postgres(process.env.DATABASE_URL);
 const [{count}] = await sql\`SELECT count(*)::int as count FROM paragraphs\`;
 console.log(count);
 await sql.end();
 " 2>/dev/null || echo "0")
 if [ "$ROW_COUNT" = "0" ]; then
   # Paths of the seed files baked into the image, plus the location where
   # the sampled IDs are stored; the scripts below read these variables.
   export SEED_PARAGRAPHS_PATH=/app/data/paragraphs-clean.jsonl
   export SEED_ANNOTATIONS_PATH=/app/data/stage1.jsonl
   export SAMPLED_IDS_PATH=/app/labelapp/.sampled-ids.json
   echo "==> Database is empty, seeding..."
   bun run scripts/seed.ts
   echo "==> Running sampling..."
   bun run scripts/sample.ts
   echo "==> Running assignment generation..."
   bun run scripts/assign.ts
   echo "==> Seeding complete."
 else
   echo "==> Database already seeded ($ROW_COUNT paragraphs). Skipping."
 fi
 echo "==> Starting Next.js server..."
 # The Dockerfile copies the contents of .next/standalone/ to /app, so the
 # server entry is /app/labelapp/server.js, i.e. ./server.js from here; there
 # is no .next/standalone/ directory inside the image.
 # `exec` makes the server replace this shell so it receives signals directly.
 exec bun run ./server.js
--- a/labelapp/lib/quiz-questions.ts
+++ b/labelapp/lib/quiz-questions.ts
@ -256,12 +256,12 @@ export const QUIZ_QUESTIONS: QuizQuestion[] = [
    id: "qv-1",
    type: "qv-counting",
    paragraphText:
-      "We maintain cyber liability insurance coverage.",
+      "Our CISO oversees a dedicated cybersecurity team responsible for managing cyber risk across the enterprise.",
    question: QV_QUESTION,
    options: QV_OPTIONS,
    correctAnswer: "3",
    explanation:
-      'This mentions insurance but provides no verifiable details (no dollar amount, no named insurer). "Cyber liability insurance" is a firm-specific fact — it tells you this particular company holds this type of coverage — but there is only one such fact. One firm-specific fact without a named standard = Specificity 3 (Firm-Specific).',
+      '"CISO" is a cybersecurity-specific title on the codebook\'s IS list — that\'s one firm-specific fact. "Dedicated cybersecurity team" is a generic team reference (NOT list). "Managing cyber risk across the enterprise" is generic. One IS-list fact, no named standards, no QV-eligible facts = Specificity 3 (Firm-Specific).',
  },
  {
    id: "qv-2",
@ -281,9 +281,9 @@ export const QUIZ_QUESTIONS: QuizQuestion[] = [
      "Our incident response team conducts quarterly tabletop exercises.",
    question: QV_QUESTION,
    options: QV_OPTIONS,
-    correctAnswer: "3",
+    correctAnswer: "1",
    explanation:
-      'Per the codebook, "quarterly" is a generic cadence and does NOT count as a specific fact for QV purposes. However, the mention of an "incident response team" and "tabletop exercises" indicates firm-specific activities. This has one firm-specific element but no hard verifiable facts (no named vendors, no dollar amounts, no exact dates). Specificity 3 (Firm-Specific).',
+      'Apply the codebook\'s validation step: "quarterly" is a generic cadence (NOT list), "tabletop exercises" is a common practice (NOT list), and "incident response team" is a generic team reference (NOT list). After filtering, no IS-list facts remain. No named standards either. This is Specificity 1 (Generic Boilerplate) — it could appear unchanged in any company\'s filing.',
  },
  {
    id: "qv-4",
--- a/labelapp/next.config.ts
+++ b/labelapp/next.config.ts
@ -1,7 +1,9 @@
 import type { NextConfig } from "next";
 import path from "node:path";
 /**
  * Next.js configuration for labelapp.
  *
  * - `output: "standalone"` makes the build emit a `.next/standalone`
  *   directory, which labelapp/Dockerfile copies into the production image.
  * - `outputFileTracingRoot` is set to the parent of this file's directory,
  *   so file tracing starts one level above labelapp/.
  */
 const nextConfig: NextConfig = {
-  /* config options here */
+  output: "standalone",
  outputFileTracingRoot: path.join(import.meta.dirname, "../"),
 };
 export default nextConfig;
--- a/labelapp/package.json
+++ b/labelapp/package.json
@ -16,7 +16,8 @@
    "test": "bun test app/ lib/ && playwright test",
    "test:api": "bun test app/ lib/",
    "test:e2e": "playwright test",
-    "test:e2e:ui": "playwright test --ui"
+    "test:e2e:ui": "playwright test --ui",
    "deploy": "docker build -f Dockerfile -t registry.claiborne.soy/labelapp:latest .. --push"
  },
  "dependencies": {
    "@base-ui/react": "^1.3.0",
@ -54,4 +55,4 @@
    "sharp",
    "unrs-resolver"
  ]
-}
+}
--- a/labelapp/playwright-report/index.html
+++ b/labelapp/playwright-report/index.html
--- a/labelapp/scripts/assign.ts
+++ b/labelapp/scripts/assign.ts
@ -8,6 +8,7 @@ import * as schema from "../db/schema";
 import { generateAssignments, printAssignmentStats } from "../lib/assignment";
 /**
  * Path of the sampled-IDs JSON file: the SAMPLED_IDS_PATH environment
  * variable when it is set, otherwise an absolute path under /home/joey.
  */
 const SAMPLED_IDS_PATH = ((): string => {
   const override = process.env.SAMPLED_IDS_PATH;
   if (override === undefined) {
     return "/home/joey/Documents/sec-cyBERT/labelapp/.sampled-ids.json";
   }
   return override;
 })();
 async function main() {
--- a/labelapp/scripts/sample.ts
+++ b/labelapp/scripts/sample.ts
@ -27,8 +27,10 @@ interface AnnotationRow {
 }
 /**
  * Destination of the sampled-IDs JSON file: the SAMPLED_IDS_PATH environment
  * variable when it is set, otherwise an absolute path under /home/joey.
  */
 const OUTPUT_PATH = ((): string => {
   const override = process.env.SAMPLED_IDS_PATH;
   return override === undefined
     ? "/home/joey/Documents/sec-cyBERT/labelapp/.sampled-ids.json"
     : override;
 })();
 /**
  * Location of the stage-1 annotations JSONL file: the SEED_ANNOTATIONS_PATH
  * environment variable when it is set, otherwise an absolute path under
  * /home/joey.
  */
 const ANNOTATIONS_PATH = ((): string => {
   const override = process.env.SEED_ANNOTATIONS_PATH;
   return override === undefined
     ? "/home/joey/Documents/sec-cyBERT/data/annotations/stage1.jsonl"
     : override;
 })();
 async function main() {
--- a/labelapp/scripts/seed.ts
+++ b/labelapp/scripts/seed.ts
@ -102,8 +102,10 @@ function computeConsensus(annotations: AnnotationRow[]): {
 async function main() {
  const PARAGRAPHS_PATH =
    process.env.SEED_PARAGRAPHS_PATH ??
    "/home/joey/Documents/sec-cyBERT/data/paragraphs/paragraphs-clean.jsonl";
  const ANNOTATIONS_PATH =
    process.env.SEED_ANNOTATIONS_PATH ??
    "/home/joey/Documents/sec-cyBERT/data/annotations/stage1.jsonl";
  // 1. Read annotations and compute consensus per paragraph