// SEC-cyBERT/labelapp/scripts/dump-all.ts

/**
 * Comprehensive data dump from the labelapp database.
 *
 * Exports:
 *   data/gold/human-labels-raw.jsonl   — every individual label with timing
 *   data/gold/paragraphs-holdout.jsonl — paragraph metadata for the 1,200-paragraph holdout set
 *   data/gold/annotators.json          — annotator profiles + onboarding timestamps
 *   data/gold/quiz-sessions.jsonl      — all quiz attempts
 *   data/gold/metrics.json             — inter-rater reliability (IRR) and summary stats: per-dimension alpha/kappa, pairwise matrices, per-category and per-stratum agreement, label distributions, timing
 */

// Fallback connection string for ad-hoc runs; a DATABASE_URL that is already
// set in the environment is left untouched.
// NOTE(review): this default embeds credentials and a private host in source —
// consider moving it to an untracked env file.
// NOTE(review): if this script runs as an ES module, the static imports below
// are evaluated before this statement, so "../db" only sees the fallback if it
// reads DATABASE_URL lazily — confirm against ../db.
if (process.env.DATABASE_URL == null) {
  process.env.DATABASE_URL =
    "postgresql://sec_cybert:sec_cybert@10.1.10.10:5432/sec_cybert";
}

import { writeFile, mkdir } from "node:fs/promises";
import { existsSync } from "node:fs";
import { db } from "../db";
import * as schema from "../db/schema";
import {
  cohensKappa,
  krippendorffsAlpha,
  agreementRate,
  perCategoryAgreement,
} from "../lib/metrics";

/**
 * Directory every export file is written to.
 *
 * Defaults to the original hard-coded location; set `LABELAPP_GOLD_DIR` to
 * run the dump on a machine where that path does not exist. An empty value is
 * treated as "not set" (hence `||` rather than `??`).
 */
const OUT_DIR: string =
  process.env.LABELAPP_GOLD_DIR || "/home/joey/Documents/sec-cyBERT/data/gold";

/**
 * Content categories an annotator can assign.
 *
 * Order matters: `main` derives each category's integer code from its
 * position in this list (index + 1) when building the alpha ratings matrix.
 */
const CATEGORIES: string[] = [
  "Board Governance",
  "Management Role",
  "Risk Management Process",
  "Third-Party Risk",
  "Incident Disclosure",
  "Strategy Integration",
  "None/Other",
];

/**
 * Serializes records as JSON Lines: one `JSON.stringify`-ed record per line,
 * each line terminated by "\n".
 *
 * @param records Objects to serialize, in output order.
 * @returns The JSONL text; the empty string when `records` is empty, so an
 *   empty export does not contain a blank (unparseable) line.
 */
function toJSONL(records: object[]): string {
  return records.map((record) => `${JSON.stringify(record)}\n`).join("");
}

/**
 * Dumps the labelapp database to `OUT_DIR` and prints an agreement summary.
 *
 * Loads every row of the human-label, annotator, paragraph, quiz-session and
 * adjudication tables, writes the four raw exports, then derives what goes
 * into `metrics.json`: consensus rates, Krippendorff's alpha, pairwise Cohen's
 * kappa, per-category and per-stratum agreement, label distributions and
 * timing statistics.
 *
 * Ends the process with exit code 0 on success; any rejection propagates to
 * the caller.
 */
async function main(): Promise<void> {
  if (!existsSync(OUT_DIR)) await mkdir(OUT_DIR, { recursive: true });

  // ── Load everything ──
  console.log("Loading all data from database...");
  const [allLabels, allAnnotators, allParagraphs, allQuizSessions, allAdjudications] =
    await Promise.all([
      db.select().from(schema.humanLabels),
      db.select().from(schema.annotators),
      db.select().from(schema.paragraphs),
      db.select().from(schema.quizSessions),
      db.select().from(schema.adjudications),
    ]);

  // Sorted so matrix rows/columns come out in the same order on every run.
  const annotatorIds = allAnnotators.map((a) => a.id).sort();
  const annotatorNames = new Map(allAnnotators.map((a) => [a.id, a.displayName]));

  const labels = allLabels;
  type Label = (typeof labels)[number];

  console.log(`  ${labels.length} human labels`);
  console.log(`  ${allParagraphs.length} paragraphs`);
  console.log(`  ${allAnnotators.length} annotators`);
  console.log(`  ${allQuizSessions.length} quiz sessions`);
  console.log(`  ${allAdjudications.length} adjudications`);

  // ── 1. Raw labels JSONL ──
  console.log("\nExporting raw labels...");
  const rawLabels = labels.map((l) => ({
    paragraphId: l.paragraphId,
    annotatorId: l.annotatorId,
    annotatorName: annotatorNames.get(l.annotatorId) ?? l.annotatorId,
    contentCategory: l.contentCategory,
    specificityLevel: l.specificityLevel,
    notes: l.notes,
    labeledAt: l.labeledAt?.toISOString() ?? null,
    sessionId: l.sessionId,
    durationMs: l.durationMs,
    activeMs: l.activeMs,
  }));
  await writeFile(`${OUT_DIR}/human-labels-raw.jsonl`, toJSONL(rawLabels));
  console.log(`  ${rawLabels.length} labels → human-labels-raw.jsonl`);

  // ── 2. Paragraph metadata JSONL ──
  console.log("\nExporting paragraph metadata...");
  const paragraphRecords = allParagraphs.map((p) => ({
    id: p.id,
    text: p.text,
    wordCount: p.wordCount,
    paragraphIndex: p.paragraphIndex,
    companyName: p.companyName,
    cik: p.cik,
    ticker: p.ticker,
    filingType: p.filingType,
    filingDate: p.filingDate,
    fiscalYear: p.fiscalYear,
    accessionNumber: p.accessionNumber,
    secItem: p.secItem,
    stage1Category: p.stage1Category,
    stage1Specificity: p.stage1Specificity,
    stage1Method: p.stage1Method,
    stage1Confidence: p.stage1Confidence,
  }));
  await writeFile(`${OUT_DIR}/paragraphs-holdout.jsonl`, toJSONL(paragraphRecords));
  console.log(`  ${paragraphRecords.length} paragraphs → paragraphs-holdout.jsonl`);

  // ── 3. Annotators JSON ──
  console.log("\nExporting annotator profiles...");
  const annotatorProfiles = allAnnotators.map((a: { id: string; displayName: string; onboardedAt: Date | null }) => ({
    id: a.id,
    displayName: a.displayName,
    onboardedAt: a.onboardedAt?.toISOString() ?? null,
  }));
  await writeFile(`${OUT_DIR}/annotators.json`, JSON.stringify(annotatorProfiles, null, 2));
  console.log(`  ${annotatorProfiles.length} annotators → annotators.json`);

  // ── 4. Quiz sessions JSONL ──
  console.log("\nExporting quiz sessions...");
  const quizRecords = allQuizSessions.map((q) => ({
    id: q.id,
    annotatorId: q.annotatorId,
    annotatorName: annotatorNames.get(q.annotatorId) ?? q.annotatorId,
    startedAt: q.startedAt?.toISOString() ?? null,
    completedAt: q.completedAt?.toISOString() ?? null,
    passed: q.passed,
    score: q.score,
    totalQuestions: q.totalQuestions,
    answers: q.answers,
  }));
  await writeFile(`${OUT_DIR}/quiz-sessions.jsonl`, toJSONL(quizRecords));
  console.log(`  ${quizRecords.length} quiz sessions → quiz-sessions.jsonl`);

  // ── 5. Comprehensive metrics ──
  console.log("\nComputing metrics...");

  // Group labels by paragraph
  const byParagraph = new Map<string, Label[]>();
  for (const label of labels) {
    const group = byParagraph.get(label.paragraphId);
    if (group) group.push(label);
    else byParagraph.set(label.paragraphId, [label]);
  }

  // fullyLabeled: paragraphs with 3+ labels (consensus, stratum and majority stats).
  // multiLabeled: paragraphs with 2+ labels (alpha and pairwise kappa).
  const fullyLabeled = new Map<string, Label[]>();
  const multiLabeled = new Map<string, Label[]>();
  for (const [pid, lbls] of byParagraph) {
    if (lbls.length >= 3) fullyLabeled.set(pid, lbls);
    if (lbls.length >= 2) multiLabeled.set(pid, lbls);
  }

  const multiLabeledParaIds = [...multiLabeled.keys()];

  // ─── Per-annotator stats ───
  const perAnnotatorStats = annotatorIds.map((aid) => {
    const myLabels = labels.filter((l) => l.annotatorId === aid);
    const activeTimes = myLabels
      .map((l) => l.activeMs)
      .filter((t): t is number => t !== null);
    const wallTimes = myLabels
      .map((l) => l.durationMs)
      .filter((t): t is number => t !== null);
    return {
      id: aid,
      name: annotatorNames.get(aid) ?? aid,
      labelCount: myLabels.length,
      medianActiveMs: activeTimes.length > 0 ? median(activeTimes) : null,
      meanActiveMs: activeTimes.length > 0 ? mean(activeTimes) : null,
      medianDurationMs: wallTimes.length > 0 ? median(wallTimes) : null,
      meanDurationMs: wallTimes.length > 0 ? mean(wallTimes) : null,
      totalActiveMs: activeTimes.length > 0 ? sum(activeTimes) : null,
      totalDurationMs: wallTimes.length > 0 ? sum(wallTimes) : null,
      labelsWithActiveTime: activeTimes.length,
    };
  });

  // ─── Consensus rates (category, specificity, both) ───
  // Each fully-labeled paragraph contributes one array of label values.
  const fullyLabeledGroups = [...fullyLabeled.values()];
  const categoryConsensusRate = agreementRate(
    fullyLabeledGroups.map((lbls) => lbls.map((l) => l.contentCategory)),
  );
  const specConsensusRate = agreementRate(
    fullyLabeledGroups.map((lbls) => lbls.map((l) => String(l.specificityLevel))),
  );
  const bothConsensusRate = agreementRate(
    fullyLabeledGroups.map((lbls) =>
      lbls.map((l) => `${l.contentCategory}|${l.specificityLevel}`),
    ),
  );

  // ─── Krippendorff's Alpha ───
  // Builds an annotators × paragraphs matrix; a cell is null when that
  // annotator has no label for that paragraph (or the label has no rating).
  const buildRatingsMatrix = (
    rate: (label: Label) => number | null,
  ): (number | null)[][] =>
    annotatorIds.map((annotatorId) =>
      multiLabeledParaIds.map((paraId) => {
        const label = multiLabeled
          .get(paraId)
          ?.find((l) => l.annotatorId === annotatorId);
        return label ? rate(label) : null;
      }),
    );

  // Categories are encoded as 1-based positions in CATEGORIES; a category that
  // is not in the list becomes a missing (null) rating.
  const catIndex = new Map(CATEGORIES.map((c, i) => [c, i + 1]));
  const categoryRatingsMatrix = buildRatingsMatrix(
    (l) => catIndex.get(l.contentCategory) ?? null,
  );
  const specRatingsMatrix = buildRatingsMatrix((l) => l.specificityLevel ?? null);

  // Alpha is reported as 0 when there are fewer than two annotators or no
  // multiply-labeled paragraphs. The author's note (also emitted in
  // metrics.json) says category alpha uses ordinal distance on nominal data.
  const canComputeAlpha =
    annotatorIds.length >= 2 && multiLabeledParaIds.length > 0;
  const categoryAlpha = canComputeAlpha
    ? krippendorffsAlpha(categoryRatingsMatrix)
    : 0;
  const specAlpha = canComputeAlpha ? krippendorffsAlpha(specRatingsMatrix) : 0;

  // ─── Pairwise Cohen's Kappa ───
  // For every annotator pair, collects the values both assigned to the same
  // paragraph and computes kappa when at least two paragraphs are shared.
  // Pairs with fewer shared paragraphs keep 0 in the matrix and are omitted
  // from `pairs`. The diagonal is 1.
  const pairwiseKappa = (valueOf: (label: Label) => string) => {
    const size = annotatorIds.length;
    const matrix: number[][] = Array.from({ length: size }, () =>
      new Array<number>(size).fill(0),
    );
    const pairs: { a1: string; a2: string; kappa: number; n: number }[] = [];

    for (let i = 0; i < size; i++) {
      matrix[i][i] = 1;
      for (let j = i + 1; j < size; j++) {
        const a1 = annotatorIds[i];
        const a2 = annotatorIds[j];
        const shared1: string[] = [];
        const shared2: string[] = [];

        for (const lbls of multiLabeled.values()) {
          const l1 = lbls.find((l) => l.annotatorId === a1);
          const l2 = lbls.find((l) => l.annotatorId === a2);
          if (l1 && l2) {
            shared1.push(valueOf(l1));
            shared2.push(valueOf(l2));
          }
        }

        if (shared1.length >= 2) {
          const kappa = cohensKappa(shared1, shared2);
          matrix[i][j] = kappa;
          matrix[j][i] = kappa;
          pairs.push({
            a1: annotatorNames.get(a1) ?? a1,
            a2: annotatorNames.get(a2) ?? a2,
            kappa,
            n: shared1.length,
          });
        }
      }
    }
    return { matrix, pairs };
  };

  const { matrix: kappaCategory, pairs: kappaCatDetails } = pairwiseKappa(
    (l) => l.contentCategory,
  );
  const { matrix: kappaSpec, pairs: kappaSpecDetails } = pairwiseKappa((l) =>
    String(l.specificityLevel),
  );

  // Mean kappa over the computed pairs; 0 when no pair had enough overlap.
  const meanKappa = (pairs: { kappa: number }[]): number =>
    pairs.length > 0
      ? pairs.reduce((s, d) => s + d.kappa, 0) / pairs.length
      : 0;

  // ─── Per-category agreement ───
  const perCategory = perCategoryAgreement(
    labels.map((l) => ({
      category: l.contentCategory,
      annotatorId: l.annotatorId,
      paragraphId: l.paragraphId,
    })),
    CATEGORIES,
  );

  // ─── Per-stratum agreement (using stage1 data to identify strata) ───
  const paragraphMeta = new Map(allParagraphs.map((p) => [p.id, p]));

  // Classify each paragraph's stratum based on stage1 data
  function classifyStratum(pid: string): string {
    const para = paragraphMeta.get(pid);
    if (!para) return "unknown";
    const method = para.stage1Method;
    const cat = para.stage1Category;
    const spec = para.stage1Specificity;

    // Check if it was a disputed paragraph based on method
    if (method === "unresolved") return "unresolved";
    if (method === "majority") {
      // Try to identify the dispute type from the category
      if (cat === "Management Role" || cat === "Risk Management Process")
        return "mgmt_rmp_split";
      if (cat === "None/Other" || cat === "Strategy Integration")
        return "noneother_strategy_split";
      if (cat === "Board Governance") return "board_mgmt_split";
      if (spec === 3 || spec === 4) return "spec_34_split";
      return "majority_other";
    }
    if (method === "unanimous") return "unanimous";
    return "proportional_random";
  }

  // A paragraph counts as "agreed" only when every label matches the first
  // one on both category and specificity.
  const strataAgreement: Record<string, { total: number; agreed: number }> = {};
  for (const [pid, lbls] of fullyLabeled) {
    const stratum = classifyStratum(pid);
    if (!strataAgreement[stratum]) {
      strataAgreement[stratum] = { total: 0, agreed: 0 };
    }
    strataAgreement[stratum].total++;
    const allSameCat = lbls.every(
      (l) => l.contentCategory === lbls[0].contentCategory,
    );
    const allSameSpec = lbls.every(
      (l) => l.specificityLevel === lbls[0].specificityLevel,
    );
    if (allSameCat && allSameSpec) strataAgreement[stratum].agreed++;
  }

  const strataRates: Record<string, { total: number; agreed: number; rate: number }> = {};
  for (const [stratum, data] of Object.entries(strataAgreement)) {
    strataRates[stratum] = {
      ...data,
      rate: data.total > 0 ? data.agreed / data.total : 0,
    };
  }

  // ─── Timing summary ───
  const allActiveTimes = labels
    .map((l) => l.activeMs)
    .filter((t): t is number => t !== null);
  const allWallTimes = labels
    .map((l) => l.durationMs)
    .filter((t): t is number => t !== null);

  // ─── Category distribution ───
  const categoryDist: Record<string, number> = {};
  for (const cat of CATEGORIES) categoryDist[cat] = 0;
  for (const l of labels) {
    categoryDist[l.contentCategory] =
      (categoryDist[l.contentCategory] ?? 0) + 1;
  }

  // ─── Specificity distribution ───
  const specDist: Record<string, number> = { "1": 0, "2": 0, "3": 0, "4": 0 };
  for (const l of labels) {
    specDist[String(l.specificityLevel)] =
      (specDist[String(l.specificityLevel)] ?? 0) + 1;
  }

  // ─── Majority label distribution (for fully-labeled paragraphs) ───
  const majorityCategories: Record<string, number> = {};
  for (const cat of CATEGORIES) majorityCategories[cat] = 0;

  for (const lbls of fullyLabeled.values()) {
    const catCounts = new Map<string, number>();
    for (const l of lbls) {
      catCounts.set(l.contentCategory, (catCounts.get(l.contentCategory) ?? 0) + 1);
    }
    // Strict ">" means a tie goes to whichever category was seen first.
    let maxCount = 0;
    let majorCat = "";
    for (const [cat, count] of catCounts) {
      if (count > maxCount) {
        maxCount = count;
        majorCat = cat;
      }
    }
    // `?? 0` keeps the count numeric for a category missing from CATEGORIES
    // (a bare `++` on an absent key would produce NaN, i.e. null in the JSON).
    if (majorCat) {
      majorityCategories[majorCat] = (majorityCategories[majorCat] ?? 0) + 1;
    }
  }

  const annotatorDisplayNames = annotatorIds.map(
    (id) => annotatorNames.get(id) ?? id,
  );

  const metrics = {
    summary: {
      totalLabels: labels.length,
      totalParagraphs: allParagraphs.length,
      fullyLabeledParagraphs: fullyLabeled.size,
      adjudicatedParagraphs: allAdjudications.length,
      annotatorCount: annotatorIds.length,
    },
    consensus: {
      categoryOnly: round(categoryConsensusRate, 4),
      specificityOnly: round(specConsensusRate, 4),
      both: round(bothConsensusRate, 4),
    },
    krippendorffsAlpha: {
      category: round(categoryAlpha, 4),
      specificity: round(specAlpha, 4),
      note: "Category alpha uses ordinal distance on nominal data (conservative). Specificity alpha uses ordinal distance.",
    },
    pairwiseKappa: {
      category: {
        annotators: annotatorDisplayNames,
        matrix: kappaCategory.map((row) => row.map((v) => round(v, 4))),
        pairs: kappaCatDetails.map((d) => ({
          ...d,
          kappa: round(d.kappa, 4),
        })),
        mean: round(meanKappa(kappaCatDetails), 4),
      },
      specificity: {
        annotators: annotatorDisplayNames,
        matrix: kappaSpec.map((row) => row.map((v) => round(v, 4))),
        pairs: kappaSpecDetails.map((d) => ({
          ...d,
          kappa: round(d.kappa, 4),
        })),
        mean: round(meanKappa(kappaSpecDetails), 4),
      },
    },
    perCategoryAgreement: Object.fromEntries(
      Object.entries(perCategory).map(([k, v]) => [k, round(v, 4)]),
    ),
    perStratumAgreement: strataRates,
    distributions: {
      categoryLabels: categoryDist,
      specificityLabels: specDist,
      majorityCategories,
    },
    timing: {
      overallMedianActiveMs: allActiveTimes.length > 0 ? median(allActiveTimes) : null,
      overallMeanActiveMs: allActiveTimes.length > 0 ? round(mean(allActiveTimes), 0) : null,
      overallMedianDurationMs: allWallTimes.length > 0 ? median(allWallTimes) : null,
      overallMeanDurationMs: allWallTimes.length > 0 ? round(mean(allWallTimes), 0) : null,
      totalActiveHours:
        allActiveTimes.length > 0
          ? round(sum(allActiveTimes) / 3_600_000, 2)
          : null,
      totalWallHours:
        allWallTimes.length > 0
          ? round(sum(allWallTimes) / 3_600_000, 2)
          : null,
      labelsWithActiveTime: allActiveTimes.length,
      labelsWithoutActiveTime: labels.length - allActiveTimes.length,
    },
    perAnnotator: perAnnotatorStats,
  };

  await writeFile(`${OUT_DIR}/metrics.json`, JSON.stringify(metrics, null, 2));
  console.log(`  metrics → metrics.json`);

  // ── Print summary to console ──
  console.log("\n" + "=".repeat(60));
  console.log("HUMAN LABELING SUMMARY");
  console.log("=".repeat(60));
  console.log(`\nParagraphs: ${fullyLabeled.size} fully labeled / ${allParagraphs.length} total`);
  console.log(`Labels: ${labels.length} total`);
  console.log(`\n── Consensus Rates (3/3 agree) ──`);
  console.log(`  Category only:    ${(categoryConsensusRate * 100).toFixed(1)}%`);
  console.log(`  Specificity only: ${(specConsensusRate * 100).toFixed(1)}%`);
  console.log(`  Both:             ${(bothConsensusRate * 100).toFixed(1)}%`);
  console.log(`\n── Krippendorff's Alpha ──`);
  console.log(`  Category:    ${categoryAlpha.toFixed(4)}`);
  console.log(`  Specificity: ${specAlpha.toFixed(4)}`);
  console.log(`\n── Pairwise Kappa (category) ──`);
  console.log(`  Mean: ${metrics.pairwiseKappa.category.mean}`);
  for (const pair of kappaCatDetails) {
    console.log(`  ${pair.a1} × ${pair.a2}: ${pair.kappa.toFixed(4)} (n=${pair.n})`);
  }
  console.log(`\n── Pairwise Kappa (specificity) ──`);
  console.log(`  Mean: ${metrics.pairwiseKappa.specificity.mean}`);
  for (const pair of kappaSpecDetails) {
    console.log(`  ${pair.a1} × ${pair.a2}: ${pair.kappa.toFixed(4)} (n=${pair.n})`);
  }
  console.log(`\n── Per-Category Agreement ──`);
  for (const [cat, rate] of Object.entries(perCategory)) {
    console.log(`  ${cat}: ${(rate * 100).toFixed(1)}%`);
  }
  console.log(`\n── Per-Stratum Agreement ──`);
  for (const [stratum, data] of Object.entries(strataRates)) {
    console.log(
      `  ${stratum}: ${(data.rate * 100).toFixed(1)}% (${data.agreed}/${data.total})`,
    );
  }
  console.log(`\n── Timing ──`);
  if (allActiveTimes.length > 0) {
    console.log(`  Median active time: ${(median(allActiveTimes) / 1000).toFixed(1)}s`);
    console.log(`  Mean active time:   ${(mean(allActiveTimes) / 1000).toFixed(1)}s`);
    console.log(`  Total active hours: ${(sum(allActiveTimes) / 3_600_000).toFixed(2)}h`);
  }
  // Guarded on its own data so wall time is still reported when no label has
  // an active-time measurement.
  if (allWallTimes.length > 0) {
    console.log(`  Total wall hours:   ${(sum(allWallTimes) / 3_600_000).toFixed(2)}h`);
  }
  console.log(`  Labels with active timer: ${allActiveTimes.length}/${labels.length}`);

  console.log(`\n── Per-Annotator ──`);
  for (const a of perAnnotatorStats) {
    // Compare against null explicitly: 0 ms is a real measurement, not "N/A".
    const activeH =
      a.totalActiveMs !== null ? (a.totalActiveMs / 3_600_000).toFixed(2) : "N/A";
    const medSec =
      a.medianActiveMs !== null ? (a.medianActiveMs / 1000).toFixed(1) : "N/A";
    console.log(
      `  ${a.name}: ${a.labelCount} labels, median ${medSec}s active, ${activeH}h total`,
    );
  }

  console.log(`\n${"=".repeat(60)}`);
  console.log(`All data exported to ${OUT_DIR}/`);
  console.log("=".repeat(60));

  process.exit(0);
}

/**
 * Median of a list of numbers; the input array is not modified.
 *
 * @param arr Values in any order.
 * @returns The middle value, or the average of the two middle values for an
 *   even-length list. An empty list yields NaN.
 */
function median(arr: number[]): number {
  const ordered = arr.slice().sort((x, y) => x - y);
  const half = Math.floor(ordered.length / 2);
  if (ordered.length % 2 === 0) {
    return (ordered[half - 1] + ordered[half]) / 2;
  }
  return ordered[half];
}

/**
 * Arithmetic mean of a list of numbers.
 *
 * @param arr Values to average.
 * @returns Sum divided by count; an empty list yields NaN (0 / 0).
 */
function mean(arr: number[]): number {
  let total = 0;
  for (const value of arr) {
    total += value;
  }
  return total / arr.length;
}

/**
 * Sum of a list of numbers.
 *
 * @param arr Values to add up.
 * @returns The total; 0 for an empty list.
 */
function sum(arr: number[]): number {
  let total = 0;
  for (const value of arr) {
    total += value;
  }
  return total;
}

/**
 * Rounds a number to a fixed number of decimal places using `Math.round`
 * on the scaled value.
 *
 * @param n Value to round.
 * @param decimals Number of decimal places to keep.
 * @returns `n` scaled by 10^decimals, rounded, and scaled back.
 */
function round(n: number, decimals: number): number {
  const scale = Math.pow(10, decimals);
  const scaled = Math.round(n * scale);
  return scaled / scale;
}

// Script entry point: run the dump; a rejection is logged and turned into
// exit status 1.
main().catch((error: unknown) => {
  console.error("Dump failed:", error);
  process.exit(1);
});