2026-04-05 15:37:50 -04:00

596 lines
21 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Comprehensive data dump from the labelapp database.
*
* Exports:
* data/gold/human-labels-raw.jsonl — every individual label with timing
* data/gold/paragraphs-holdout.jsonl — paragraph metadata for the 1,200 holdout paragraphs
* data/gold/annotators.json — annotator profiles + onboarding timestamps
* data/gold/quiz-sessions.jsonl — all quiz attempts
* data/gold/metrics.json — comprehensive IRR: per-dimension alpha/kappa, pairwise matrices, per-category, per-stratum
*/
// Fallback connection string, applied only when DATABASE_URL is not already
// present in the environment.
// NOTE(review): this embeds credentials and a private host in source — consider
// requiring the variable instead of defaulting it.
// NOTE(review): under ES-module semantics the imports that follow are evaluated
// before this statement runs, so the fallback only takes effect if ../db reads
// DATABASE_URL lazily — TODO confirm against ../db.
if (process.env.DATABASE_URL == null) {
  process.env.DATABASE_URL =
    "postgresql://sec_cybert:sec_cybert@10.1.10.10:5432/sec_cybert";
}
import { writeFile, mkdir } from "node:fs/promises";
import { existsSync } from "node:fs";
import { db } from "../db";
import * as schema from "../db/schema";
import {
cohensKappa,
krippendorffsAlpha,
agreementRate,
perCategoryAgreement,
} from "../lib/metrics";
/**
 * Directory that receives every exported file.
 * NOTE(review): absolute path into one user's home directory — not portable.
 */
const OUT_DIR = "/home/joey/Documents/sec-cyBERT/data/gold";

/**
 * Content categories an annotator can assign.
 *
 * Order is significant: `main` turns each entry's position (1-based) into the
 * numeric code fed to Krippendorff's alpha, and uses the list to seed the
 * distribution counters and per-category agreement.
 */
const CATEGORIES: string[] = [
  "Board Governance",
  "Management Role",
  "Risk Management Process",
  "Third-Party Risk",
  "Incident Disclosure",
  "Strategy Integration",
  "None/Other",
];
/**
 * Serialises records as JSON Lines: one `JSON.stringify`-ed record per line,
 * each line (including the last) terminated by `\n`.
 *
 * An empty input yields an empty string rather than a lone newline, so an
 * empty export does not contain a blank line that line-by-line JSON parsers
 * would fail on.
 *
 * @param records Objects to serialise, in output order.
 * @returns The JSONL document text.
 */
function toJSONL(records: readonly object[]): string {
  if (records.length === 0) return "";
  return records.map((r) => JSON.stringify(r)).join("\n") + "\n";
}
/**
 * Dumps the labelapp database to `OUT_DIR` and computes inter-rater metrics.
 *
 * Steps, in order:
 *  1. Load human labels, annotators, paragraphs, quiz sessions and
 *     adjudications in parallel.
 *  2. Write human-labels-raw.jsonl, paragraphs-holdout.jsonl, annotators.json
 *     and quiz-sessions.jsonl.
 *  3. Compute consensus rates, Krippendorff's alpha, pairwise Cohen's kappa,
 *     per-category and per-stratum agreement, label distributions and timing
 *     statistics, and write them to metrics.json.
 *  4. Print a summary to the console and terminate the process with code 0.
 *
 * Any rejected query or file write propagates to the caller as a rejection.
 */
async function main(): Promise<void> {
  if (!existsSync(OUT_DIR)) await mkdir(OUT_DIR, { recursive: true });

  // ── Load everything ──
  console.log("Loading all data from database...");
  const [allLabels, allAnnotators, allParagraphs, allQuizSessions, allAdjudications] =
    await Promise.all([
      db.select().from(schema.humanLabels),
      db.select().from(schema.annotators),
      db.select().from(schema.paragraphs),
      db.select().from(schema.quizSessions),
      db.select().from(schema.adjudications),
    ]);
  // Sorted so that matrix rows/columns have a stable annotator order.
  const annotatorIds = allAnnotators.map((a) => a.id).sort();
  const annotatorNames = new Map(allAnnotators.map((a) => [a.id, a.displayName]));
  const labels = allLabels;
  type LabelRow = (typeof labels)[number];
  console.log(` ${labels.length} human labels`);
  console.log(` ${allParagraphs.length} paragraphs`);
  console.log(` ${allAnnotators.length} annotators`);
  console.log(` ${allQuizSessions.length} quiz sessions`);
  console.log(` ${allAdjudications.length} adjudications`);

  // ── 1. Raw labels JSONL ──
  console.log("\nExporting raw labels...");
  const rawLabels = labels.map((l) => ({
    paragraphId: l.paragraphId,
    annotatorId: l.annotatorId,
    // Fall back to the raw id when the annotator row is missing.
    annotatorName: annotatorNames.get(l.annotatorId) ?? l.annotatorId,
    contentCategory: l.contentCategory,
    specificityLevel: l.specificityLevel,
    notes: l.notes,
    labeledAt: l.labeledAt?.toISOString() ?? null,
    sessionId: l.sessionId,
    durationMs: l.durationMs,
    activeMs: l.activeMs,
  }));
  await writeFile(`${OUT_DIR}/human-labels-raw.jsonl`, toJSONL(rawLabels));
  console.log(` ${rawLabels.length} labels → human-labels-raw.jsonl`);

  // ── 2. Paragraph metadata JSONL ──
  console.log("\nExporting paragraph metadata...");
  const paragraphRecords = allParagraphs.map((p) => ({
    id: p.id,
    text: p.text,
    wordCount: p.wordCount,
    paragraphIndex: p.paragraphIndex,
    companyName: p.companyName,
    cik: p.cik,
    ticker: p.ticker,
    filingType: p.filingType,
    filingDate: p.filingDate,
    fiscalYear: p.fiscalYear,
    accessionNumber: p.accessionNumber,
    secItem: p.secItem,
    stage1Category: p.stage1Category,
    stage1Specificity: p.stage1Specificity,
    stage1Method: p.stage1Method,
    stage1Confidence: p.stage1Confidence,
  }));
  await writeFile(`${OUT_DIR}/paragraphs-holdout.jsonl`, toJSONL(paragraphRecords));
  console.log(` ${paragraphRecords.length} paragraphs → paragraphs-holdout.jsonl`);

  // ── 3. Annotators JSON ──
  console.log("\nExporting annotator profiles...");
  const annotatorProfiles = allAnnotators.map(
    (a: { id: string; displayName: string; onboardedAt: Date | null }) => ({
      id: a.id,
      displayName: a.displayName,
      onboardedAt: a.onboardedAt?.toISOString() ?? null,
    }),
  );
  await writeFile(`${OUT_DIR}/annotators.json`, JSON.stringify(annotatorProfiles, null, 2));
  console.log(` ${annotatorProfiles.length} annotators → annotators.json`);

  // ── 4. Quiz sessions JSONL ──
  console.log("\nExporting quiz sessions...");
  const quizRecords = allQuizSessions.map((q) => ({
    id: q.id,
    annotatorId: q.annotatorId,
    annotatorName: annotatorNames.get(q.annotatorId) ?? q.annotatorId,
    startedAt: q.startedAt?.toISOString() ?? null,
    completedAt: q.completedAt?.toISOString() ?? null,
    passed: q.passed,
    score: q.score,
    totalQuestions: q.totalQuestions,
    answers: q.answers,
  }));
  await writeFile(`${OUT_DIR}/quiz-sessions.jsonl`, toJSONL(quizRecords));
  console.log(` ${quizRecords.length} quiz sessions → quiz-sessions.jsonl`);

  // ── 5. Comprehensive metrics ──
  console.log("\nComputing metrics...");

  // Group labels by paragraph
  const byParagraph = new Map<string, typeof labels>();
  for (const label of labels) {
    const group = byParagraph.get(label.paragraphId);
    if (group) group.push(label);
    else byParagraph.set(label.paragraphId, [label]);
  }

  // fullyLabeled: 3+ labels (consensus / strata / majority).
  // multiLabeled: 2+ labels (alpha and pairwise kappa).
  const fullyLabeled = new Map<string, typeof labels>();
  const multiLabeled = new Map<string, typeof labels>();
  for (const [pid, lbls] of byParagraph) {
    if (lbls.length >= 2) multiLabeled.set(pid, lbls);
    if (lbls.length >= 3) fullyLabeled.set(pid, lbls);
  }
  const multiLabeledParaIds = [...multiLabeled.keys()];
  const fullyLabeledGroups = [...fullyLabeled.values()];

  // ─── Per-annotator stats ───
  const perAnnotatorStats = annotatorIds.map((aid) => {
    const myLabels = labels.filter((l) => l.annotatorId === aid);
    const activeTimes = myLabels
      .map((l) => l.activeMs)
      .filter((t): t is number => t !== null);
    const wallTimes = myLabels
      .map((l) => l.durationMs)
      .filter((t): t is number => t !== null);
    return {
      id: aid,
      name: annotatorNames.get(aid) ?? aid,
      labelCount: myLabels.length,
      // Statistics are null (not NaN/0) when the annotator has no timing samples.
      medianActiveMs: activeTimes.length > 0 ? median(activeTimes) : null,
      meanActiveMs: activeTimes.length > 0 ? mean(activeTimes) : null,
      medianDurationMs: wallTimes.length > 0 ? median(wallTimes) : null,
      meanDurationMs: wallTimes.length > 0 ? mean(wallTimes) : null,
      totalActiveMs: activeTimes.length > 0 ? sum(activeTimes) : null,
      totalDurationMs: wallTimes.length > 0 ? sum(wallTimes) : null,
      labelsWithActiveTime: activeTimes.length,
    };
  });

  // ─── Consensus rates over fully-labeled paragraphs ───
  const categoryArrays: string[][] = fullyLabeledGroups.map((lbls) =>
    lbls.map((l) => l.contentCategory),
  );
  const categoryConsensusRate = agreementRate(categoryArrays);

  const specArrays: string[][] = fullyLabeledGroups.map((lbls) =>
    lbls.map((l) => String(l.specificityLevel)),
  );
  const specConsensusRate = agreementRate(specArrays);

  // "Both" treats the (category, specificity) pair as a single label.
  const bothArrays: string[][] = fullyLabeledGroups.map((lbls) =>
    lbls.map((l) => `${l.contentCategory}|${l.specificityLevel}`),
  );
  const bothConsensusRate = agreementRate(bothArrays);

  // ─── Krippendorff's Alpha ───
  // Builds an annotators × paragraphs matrix over the multi-labeled paragraphs;
  // a cell is null when that annotator did not label that paragraph.
  const ratingsMatrix = (
    rate: (l: LabelRow) => number | null,
  ): (number | null)[][] =>
    annotatorIds.map((annotatorId) =>
      multiLabeledParaIds.map((paraId) => {
        const label = multiLabeled
          .get(paraId)
          ?.find((l) => l.annotatorId === annotatorId);
        return label ? rate(label) : null;
      }),
    );
  const hasAlphaInput = annotatorIds.length >= 2 && multiLabeledParaIds.length > 0;

  // Categories are encoded as 1-based positions in CATEGORIES; a category not
  // in the list becomes null. Per the note written to metrics.json, the alpha
  // helper applies an ordinal distance, which the original author describes as
  // conservative for this nominal variable.
  const catIndex = new Map(CATEGORIES.map((c, i) => [c, i + 1]));
  const categoryRatingsMatrix = ratingsMatrix(
    (l) => catIndex.get(l.contentCategory) ?? null,
  );
  const categoryAlpha = hasAlphaInput ? krippendorffsAlpha(categoryRatingsMatrix) : 0;

  const specRatingsMatrix = ratingsMatrix((l) => l.specificityLevel ?? null);
  const specAlpha = hasAlphaInput ? krippendorffsAlpha(specRatingsMatrix) : 0;

  // ─── Pairwise Cohen's Kappa ───
  type KappaPair = { a1: string; a2: string; kappa: number; n: number };
  // Computes kappa for every annotator pair over the paragraphs both labeled.
  // The matrix is symmetric with 1 on the diagonal. Pairs sharing fewer than
  // two paragraphs are skipped: their matrix cells stay 0 and they do not
  // appear in `pairs` (so they do not affect the reported mean).
  const pairwiseKappa = (
    valueOf: (l: LabelRow) => string,
  ): { matrix: number[][]; pairs: KappaPair[] } => {
    const matrix: number[][] = Array.from(
      { length: annotatorIds.length },
      () => new Array(annotatorIds.length).fill(0),
    );
    const pairs: KappaPair[] = [];
    for (let i = 0; i < annotatorIds.length; i++) {
      matrix[i][i] = 1;
      for (let j = i + 1; j < annotatorIds.length; j++) {
        const a1 = annotatorIds[i];
        const a2 = annotatorIds[j];
        const shared1: string[] = [];
        const shared2: string[] = [];
        for (const lbls of multiLabeled.values()) {
          const l1 = lbls.find((l) => l.annotatorId === a1);
          const l2 = lbls.find((l) => l.annotatorId === a2);
          if (l1 && l2) {
            shared1.push(valueOf(l1));
            shared2.push(valueOf(l2));
          }
        }
        if (shared1.length >= 2) {
          const kappa = cohensKappa(shared1, shared2);
          matrix[i][j] = kappa;
          matrix[j][i] = kappa;
          pairs.push({
            a1: annotatorNames.get(a1) ?? a1,
            a2: annotatorNames.get(a2) ?? a2,
            kappa,
            n: shared1.length,
          });
        }
      }
    }
    return { matrix, pairs };
  };
  const { matrix: kappaCategory, pairs: kappaCatDetails } = pairwiseKappa(
    (l) => l.contentCategory,
  );
  const { matrix: kappaSpec, pairs: kappaSpecDetails } = pairwiseKappa((l) =>
    String(l.specificityLevel),
  );

  // ─── Per-category agreement ───
  const perCategory = perCategoryAgreement(
    labels.map((l) => ({
      category: l.contentCategory,
      annotatorId: l.annotatorId,
      paragraphId: l.paragraphId,
    })),
    CATEGORIES,
  );

  // ─── Per-stratum agreement (using stage1 data to identify strata) ───
  const paragraphMeta = new Map(allParagraphs.map((p) => [p.id, p]));

  // Classify each paragraph's stratum based on stage1 data. The first matching
  // rule wins, so for "majority" paragraphs the category rules take precedence
  // over the specificity 3/4 rule.
  function classifyStratum(pid: string): string {
    const para = paragraphMeta.get(pid);
    if (!para) return "unknown";
    const method = para.stage1Method;
    const cat = para.stage1Category;
    const spec = para.stage1Specificity;
    // Check if it was a disputed paragraph based on method
    if (method === "unresolved") return "unresolved";
    if (method === "majority") {
      // Try to identify the dispute type from the category
      if (cat === "Management Role" || cat === "Risk Management Process")
        return "mgmt_rmp_split";
      if (cat === "None/Other" || cat === "Strategy Integration")
        return "noneother_strategy_split";
      if (cat === "Board Governance") return "board_mgmt_split";
      if (spec === 3 || spec === 4) return "spec_34_split";
      return "majority_other";
    }
    if (method === "unanimous") return "unanimous";
    return "proportional_random";
  }

  // A paragraph counts as "agreed" only when every label matches on both
  // category and specificity.
  const strataAgreement: Record<string, { total: number; agreed: number }> = {};
  for (const [pid, lbls] of fullyLabeled) {
    const stratum = classifyStratum(pid);
    if (!strataAgreement[stratum]) {
      strataAgreement[stratum] = { total: 0, agreed: 0 };
    }
    strataAgreement[stratum].total++;
    const allSameCat = lbls.every(
      (l) => l.contentCategory === lbls[0].contentCategory,
    );
    const allSameSpec = lbls.every(
      (l) => l.specificityLevel === lbls[0].specificityLevel,
    );
    if (allSameCat && allSameSpec) strataAgreement[stratum].agreed++;
  }
  const strataRates: Record<string, { total: number; agreed: number; rate: number }> = {};
  for (const [stratum, data] of Object.entries(strataAgreement)) {
    strataRates[stratum] = {
      ...data,
      rate: data.total > 0 ? data.agreed / data.total : 0,
    };
  }

  // ─── Timing summary ───
  const allActiveTimes = labels
    .map((l) => l.activeMs)
    .filter((t): t is number => t !== null);
  const allWallTimes = labels
    .map((l) => l.durationMs)
    .filter((t): t is number => t !== null);

  // ─── Category distribution ───
  // Seeded with every known category so zero counts are still reported.
  const categoryDist: Record<string, number> = {};
  for (const cat of CATEGORIES) categoryDist[cat] = 0;
  for (const l of labels) {
    categoryDist[l.contentCategory] =
      (categoryDist[l.contentCategory] ?? 0) + 1;
  }

  // ─── Specificity distribution ───
  const specDist: Record<string, number> = { "1": 0, "2": 0, "3": 0, "4": 0 };
  for (const l of labels) {
    specDist[String(l.specificityLevel)] =
      (specDist[String(l.specificityLevel)] ?? 0) + 1;
  }

  // ─── Majority label distribution (for fully-labeled paragraphs) ───
  // NOTE(review): this is a plurality — on a tie the category encountered
  // first wins, so a 1/1/1 split is attributed to the first label in the group.
  const majorityCategories: Record<string, number> = {};
  for (const cat of CATEGORIES) majorityCategories[cat] = 0;
  for (const lbls of fullyLabeledGroups) {
    const catCounts = new Map<string, number>();
    for (const l of lbls) {
      catCounts.set(l.contentCategory, (catCounts.get(l.contentCategory) ?? 0) + 1);
    }
    let maxCount = 0;
    let majorCat = "";
    for (const [cat, count] of catCounts) {
      if (count > maxCount) {
        maxCount = count;
        majorCat = cat;
      }
    }
    if (majorCat) majorityCategories[majorCat]++;
  }

  const metrics = {
    summary: {
      totalLabels: labels.length,
      totalParagraphs: allParagraphs.length,
      fullyLabeledParagraphs: fullyLabeled.size,
      adjudicatedParagraphs: allAdjudications.length,
      annotatorCount: annotatorIds.length,
    },
    consensus: {
      categoryOnly: round(categoryConsensusRate, 4),
      specificityOnly: round(specConsensusRate, 4),
      both: round(bothConsensusRate, 4),
    },
    krippendorffsAlpha: {
      category: round(categoryAlpha, 4),
      specificity: round(specAlpha, 4),
      note: "Category alpha uses ordinal distance on nominal data (conservative). Specificity alpha uses ordinal distance.",
    },
    pairwiseKappa: {
      category: {
        annotators: annotatorIds.map((id) => annotatorNames.get(id) ?? id),
        matrix: kappaCategory.map((row) => row.map((v) => round(v, 4))),
        pairs: kappaCatDetails.map((d) => ({
          ...d,
          kappa: round(d.kappa, 4),
        })),
        mean: round(
          kappaCatDetails.length > 0
            ? kappaCatDetails.reduce((s, d) => s + d.kappa, 0) /
                kappaCatDetails.length
            : 0,
          4,
        ),
      },
      specificity: {
        annotators: annotatorIds.map((id) => annotatorNames.get(id) ?? id),
        matrix: kappaSpec.map((row) => row.map((v) => round(v, 4))),
        pairs: kappaSpecDetails.map((d) => ({
          ...d,
          kappa: round(d.kappa, 4),
        })),
        mean: round(
          kappaSpecDetails.length > 0
            ? kappaSpecDetails.reduce((s, d) => s + d.kappa, 0) /
                kappaSpecDetails.length
            : 0,
          4,
        ),
      },
    },
    perCategoryAgreement: Object.fromEntries(
      Object.entries(perCategory).map(([k, v]) => [k, round(v, 4)]),
    ),
    perStratumAgreement: strataRates,
    distributions: {
      categoryLabels: categoryDist,
      specificityLabels: specDist,
      majorityCategories,
    },
    timing: {
      overallMedianActiveMs: allActiveTimes.length > 0 ? median(allActiveTimes) : null,
      overallMeanActiveMs: allActiveTimes.length > 0 ? round(mean(allActiveTimes), 0) : null,
      overallMedianDurationMs: allWallTimes.length > 0 ? median(allWallTimes) : null,
      overallMeanDurationMs: allWallTimes.length > 0 ? round(mean(allWallTimes), 0) : null,
      totalActiveHours:
        allActiveTimes.length > 0
          ? round(sum(allActiveTimes) / 3_600_000, 2)
          : null,
      totalWallHours:
        allWallTimes.length > 0
          ? round(sum(allWallTimes) / 3_600_000, 2)
          : null,
      labelsWithActiveTime: allActiveTimes.length,
      labelsWithoutActiveTime: labels.length - allActiveTimes.length,
    },
    perAnnotator: perAnnotatorStats,
  };
  await writeFile(`${OUT_DIR}/metrics.json`, JSON.stringify(metrics, null, 2));
  console.log(` metrics → metrics.json`);

  // ── Print summary to console ──
  console.log("\n" + "=".repeat(60));
  console.log("HUMAN LABELING SUMMARY");
  console.log("=".repeat(60));
  console.log(`\nParagraphs: ${fullyLabeled.size} fully labeled / ${allParagraphs.length} total`);
  console.log(`Labels: ${labels.length} total`);
  console.log(`\n── Consensus Rates (3/3 agree) ──`);
  console.log(` Category only: ${(categoryConsensusRate * 100).toFixed(1)}%`);
  console.log(` Specificity only: ${(specConsensusRate * 100).toFixed(1)}%`);
  console.log(` Both: ${(bothConsensusRate * 100).toFixed(1)}%`);
  console.log(`\n── Krippendorff's Alpha ──`);
  console.log(` Category: ${categoryAlpha.toFixed(4)}`);
  console.log(` Specificity: ${specAlpha.toFixed(4)}`);
  console.log(`\n── Pairwise Kappa (category) ──`);
  console.log(` Mean: ${metrics.pairwiseKappa.category.mean}`);
  for (const pair of kappaCatDetails) {
    console.log(` ${pair.a1} × ${pair.a2}: ${pair.kappa.toFixed(4)} (n=${pair.n})`);
  }
  console.log(`\n── Pairwise Kappa (specificity) ──`);
  console.log(` Mean: ${metrics.pairwiseKappa.specificity.mean}`);
  for (const pair of kappaSpecDetails) {
    console.log(` ${pair.a1} × ${pair.a2}: ${pair.kappa.toFixed(4)} (n=${pair.n})`);
  }
  console.log(`\n── Per-Category Agreement ──`);
  for (const [cat, rate] of Object.entries(perCategory)) {
    console.log(` ${cat}: ${(rate * 100).toFixed(1)}%`);
  }
  console.log(`\n── Per-Stratum Agreement ──`);
  for (const [stratum, data] of Object.entries(strataRates)) {
    console.log(
      ` ${stratum}: ${(data.rate * 100).toFixed(1)}% (${data.agreed}/${data.total})`,
    );
  }
  console.log(`\n── Timing ──`);
  if (allActiveTimes.length > 0) {
    console.log(` Median active time: ${(median(allActiveTimes) / 1000).toFixed(1)}s`);
    console.log(` Mean active time: ${(mean(allActiveTimes) / 1000).toFixed(1)}s`);
    console.log(` Total active hours: ${(sum(allActiveTimes) / 3_600_000).toFixed(2)}h`);
  }
  // Wall-clock totals are independent of the active timer, so guard them separately.
  if (allWallTimes.length > 0) {
    console.log(` Total wall hours: ${(sum(allWallTimes) / 3_600_000).toFixed(2)}h`);
  }
  console.log(` Labels with active timer: ${allActiveTimes.length}/${labels.length}`);
  console.log(`\n── Per-Annotator ──`);
  for (const a of perAnnotatorStats) {
    // Compare against null explicitly: a legitimate 0 must not print as "N/A".
    const activeH =
      a.totalActiveMs !== null ? (a.totalActiveMs / 3_600_000).toFixed(2) : "N/A";
    const medSec =
      a.medianActiveMs !== null ? (a.medianActiveMs / 1000).toFixed(1) : "N/A";
    console.log(
      ` ${a.name}: ${a.labelCount} labels, median ${medSec}s active, ${activeH}h total`,
    );
  }
  console.log(`\n${"=".repeat(60)}`);
  console.log(`All data exported to ${OUT_DIR}/`);
  console.log("=".repeat(60));
  // Explicit exit — presumably the DB connection would otherwise keep the
  // process alive; TODO confirm against ../db.
  process.exit(0);
}
/**
 * Median of a list of numbers; the input array is left unmodified.
 *
 * For an even count the two middle values are averaged. An empty array
 * yields NaN, so callers are expected to check the length first.
 */
function median(arr: number[]): number {
  const ordered = arr.slice().sort((x, y) => x - y);
  const half = Math.floor(ordered.length / 2);
  if (ordered.length % 2 !== 0) {
    return ordered[half];
  }
  return (ordered[half - 1] + ordered[half]) / 2;
}
/**
 * Arithmetic mean of a list of numbers.
 *
 * An empty array yields NaN (0 / 0), so callers are expected to check the
 * length first.
 */
function mean(arr: number[]): number {
  let total = 0;
  for (const value of arr) {
    total += value;
  }
  return total / arr.length;
}
/** Sum of a list of numbers, added left to right; 0 for an empty array. */
function sum(arr: number[]): number {
  let total = 0;
  for (const value of arr) {
    total += value;
  }
  return total;
}
/**
 * Rounds `n` to `decimals` decimal places by scaling, applying `Math.round`
 * and scaling back.
 *
 * Inherits `Math.round` semantics (exact halves round toward +Infinity) and
 * binary floating-point representation error of the scaled value.
 */
function round(n: number, decimals: number): number {
  const scale = Math.pow(10, decimals);
  const scaled = Math.round(n * scale);
  return scaled / scale;
}
// Script entry point: run the dump, and on any rejection log the failure and
// exit with a non-zero status.
main().catch((failure: unknown) => {
  console.error("Dump failed:", failure);
  process.exit(1);
});