/**
 * Comprehensive data dump from the labelapp database.
 *
 * Exports:
 *   data/gold/human-labels-raw.jsonl   — every individual label with timing
 *   data/gold/paragraphs-holdout.jsonl — paragraph metadata for the 1,200 holdout
 *   data/gold/annotators.json          — annotator profiles + onboarding timestamps
 *   data/gold/quiz-sessions.jsonl      — all quiz attempts
 *   data/gold/metrics.json             — comprehensive IRR: per-dimension alpha/kappa, pairwise matrices, per-category, per-stratum
 */
|
||
|
||
// Default to the shared dev database when DATABASE_URL is not already set
// (must happen before `../db` is imported below, which presumably reads it —
// TODO confirm).
// NOTE(review): credentials are hardcoded in source — prefer requiring the
// env var (fail fast) or loading it from a .env file / secrets manager.
process.env.DATABASE_URL ??=
  "postgresql://sec_cybert:sec_cybert@10.1.10.10:5432/sec_cybert";
|
||
import { existsSync } from "node:fs";
import { writeFile, mkdir } from "node:fs/promises";

import { db } from "../db";
import * as schema from "../db/schema";
import {
  cohensKappa,
  krippendorffsAlpha,
  agreementRate,
  perCategoryAgreement,
} from "../lib/metrics";
// Destination directory for all exported gold-set artifacts.
// NOTE(review): absolute, user-specific path — consider deriving it from the
// repository root or an environment variable for portability.
const OUT_DIR = "/home/joey/Documents/sec-cyBERT/data/gold";

// Canonical content-category label set. Order matters: the index (+1) is the
// integer encoding used for the Krippendorff's alpha computation in main().
const CATEGORIES = [
  "Board Governance",
  "Management Role",
  "Risk Management Process",
  "Third-Party Risk",
  "Incident Disclosure",
  "Strategy Integration",
  "None/Other",
];
function toJSONL(records: object[]): string {
|
||
return records.map((r) => JSON.stringify(r)).join("\n") + "\n";
|
||
}
|
||
|
||
async function main() {
|
||
if (!existsSync(OUT_DIR)) await mkdir(OUT_DIR, { recursive: true });
|
||
|
||
// ── Load everything ──
|
||
console.log("Loading all data from database...");
|
||
const [allLabels, allAnnotators, allParagraphs, allQuizSessions, allAdjudications] =
|
||
await Promise.all([
|
||
db.select().from(schema.humanLabels),
|
||
db.select().from(schema.annotators),
|
||
db.select().from(schema.paragraphs),
|
||
db.select().from(schema.quizSessions),
|
||
db.select().from(schema.adjudications),
|
||
]);
|
||
|
||
const annotatorIds = allAnnotators.map((a) => a.id).sort();
|
||
const annotatorNames = new Map(allAnnotators.map((a) => [a.id, a.displayName]));
|
||
|
||
const labels = allLabels;
|
||
|
||
console.log(` ${labels.length} human labels`);
|
||
console.log(` ${allParagraphs.length} paragraphs`);
|
||
console.log(` ${allAnnotators.length} annotators`);
|
||
console.log(` ${allQuizSessions.length} quiz sessions`);
|
||
console.log(` ${allAdjudications.length} adjudications`);
|
||
|
||
// ── 1. Raw labels JSONL ──
|
||
console.log("\nExporting raw labels...");
|
||
const rawLabels = labels.map((l) => ({
|
||
paragraphId: l.paragraphId,
|
||
annotatorId: l.annotatorId,
|
||
annotatorName: annotatorNames.get(l.annotatorId) ?? l.annotatorId,
|
||
contentCategory: l.contentCategory,
|
||
specificityLevel: l.specificityLevel,
|
||
notes: l.notes,
|
||
labeledAt: l.labeledAt?.toISOString() ?? null,
|
||
sessionId: l.sessionId,
|
||
durationMs: l.durationMs,
|
||
activeMs: l.activeMs,
|
||
}));
|
||
await writeFile(`${OUT_DIR}/human-labels-raw.jsonl`, toJSONL(rawLabels));
|
||
console.log(` ${rawLabels.length} labels → human-labels-raw.jsonl`);
|
||
|
||
// ── 2. Paragraph metadata JSONL ──
|
||
console.log("\nExporting paragraph metadata...");
|
||
const paragraphRecords = allParagraphs.map((p) => ({
|
||
id: p.id,
|
||
text: p.text,
|
||
wordCount: p.wordCount,
|
||
paragraphIndex: p.paragraphIndex,
|
||
companyName: p.companyName,
|
||
cik: p.cik,
|
||
ticker: p.ticker,
|
||
filingType: p.filingType,
|
||
filingDate: p.filingDate,
|
||
fiscalYear: p.fiscalYear,
|
||
accessionNumber: p.accessionNumber,
|
||
secItem: p.secItem,
|
||
stage1Category: p.stage1Category,
|
||
stage1Specificity: p.stage1Specificity,
|
||
stage1Method: p.stage1Method,
|
||
stage1Confidence: p.stage1Confidence,
|
||
}));
|
||
await writeFile(`${OUT_DIR}/paragraphs-holdout.jsonl`, toJSONL(paragraphRecords));
|
||
console.log(` ${paragraphRecords.length} paragraphs → paragraphs-holdout.jsonl`);
|
||
|
||
// ── 3. Annotators JSON ──
|
||
console.log("\nExporting annotator profiles...");
|
||
const annotatorProfiles = allAnnotators.map((a: { id: string; displayName: string; onboardedAt: Date | null }) => ({
|
||
id: a.id,
|
||
displayName: a.displayName,
|
||
onboardedAt: a.onboardedAt?.toISOString() ?? null,
|
||
}));
|
||
await writeFile(`${OUT_DIR}/annotators.json`, JSON.stringify(annotatorProfiles, null, 2));
|
||
console.log(` ${annotatorProfiles.length} annotators → annotators.json`);
|
||
|
||
// ── 4. Quiz sessions JSONL ──
|
||
console.log("\nExporting quiz sessions...");
|
||
const quizRecords = allQuizSessions.map((q) => ({
|
||
id: q.id,
|
||
annotatorId: q.annotatorId,
|
||
annotatorName: annotatorNames.get(q.annotatorId) ?? q.annotatorId,
|
||
startedAt: q.startedAt?.toISOString() ?? null,
|
||
completedAt: q.completedAt?.toISOString() ?? null,
|
||
passed: q.passed,
|
||
score: q.score,
|
||
totalQuestions: q.totalQuestions,
|
||
answers: q.answers,
|
||
}));
|
||
await writeFile(`${OUT_DIR}/quiz-sessions.jsonl`, toJSONL(quizRecords));
|
||
console.log(` ${quizRecords.length} quiz sessions → quiz-sessions.jsonl`);
|
||
|
||
// ── 5. Comprehensive metrics ──
|
||
console.log("\nComputing metrics...");
|
||
|
||
// Group labels by paragraph
|
||
const byParagraph = new Map<string, typeof labels>();
|
||
for (const label of labels) {
|
||
const group = byParagraph.get(label.paragraphId);
|
||
if (group) group.push(label);
|
||
else byParagraph.set(label.paragraphId, [label]);
|
||
}
|
||
|
||
// Only paragraphs with 3+ labels
|
||
const fullyLabeled = new Map<string, typeof labels>();
|
||
for (const [pid, lbls] of byParagraph) {
|
||
if (lbls.length >= 3) fullyLabeled.set(pid, lbls);
|
||
}
|
||
|
||
// Paragraphs with 2+ labels (for pairwise)
|
||
const multiLabeled = new Map<string, typeof labels>();
|
||
for (const [pid, lbls] of byParagraph) {
|
||
if (lbls.length >= 2) multiLabeled.set(pid, lbls);
|
||
}
|
||
|
||
const multiLabeledParaIds = [...multiLabeled.keys()];
|
||
|
||
// ─── Per-annotator stats ───
|
||
const perAnnotatorStats = annotatorIds.map((aid) => {
|
||
const myLabels = labels.filter((l) => l.annotatorId === aid);
|
||
const activeTimes = myLabels
|
||
.map((l) => l.activeMs)
|
||
.filter((t): t is number => t !== null);
|
||
const wallTimes = myLabels
|
||
.map((l) => l.durationMs)
|
||
.filter((t): t is number => t !== null);
|
||
return {
|
||
id: aid,
|
||
name: annotatorNames.get(aid) ?? aid,
|
||
labelCount: myLabels.length,
|
||
medianActiveMs: activeTimes.length > 0 ? median(activeTimes) : null,
|
||
meanActiveMs: activeTimes.length > 0 ? mean(activeTimes) : null,
|
||
medianDurationMs: wallTimes.length > 0 ? median(wallTimes) : null,
|
||
meanDurationMs: wallTimes.length > 0 ? mean(wallTimes) : null,
|
||
totalActiveMs: activeTimes.length > 0 ? sum(activeTimes) : null,
|
||
totalDurationMs: wallTimes.length > 0 ? sum(wallTimes) : null,
|
||
labelsWithActiveTime: activeTimes.length,
|
||
};
|
||
});
|
||
|
||
// ─── Category consensus ───
|
||
const categoryArrays: string[][] = [];
|
||
for (const lbls of fullyLabeled.values()) {
|
||
categoryArrays.push(lbls.map((l) => l.contentCategory));
|
||
}
|
||
const categoryConsensusRate = agreementRate(categoryArrays);
|
||
|
||
// ─── Specificity consensus ───
|
||
const specArrays: string[][] = [];
|
||
for (const lbls of fullyLabeled.values()) {
|
||
specArrays.push(lbls.map((l) => String(l.specificityLevel)));
|
||
}
|
||
const specConsensusRate = agreementRate(specArrays);
|
||
|
||
// ─── Both consensus ───
|
||
const bothArrays: string[][] = [];
|
||
for (const lbls of fullyLabeled.values()) {
|
||
bothArrays.push(
|
||
lbls.map((l) => `${l.contentCategory}|${l.specificityLevel}`),
|
||
);
|
||
}
|
||
const bothConsensusRate = agreementRate(bothArrays);
|
||
|
||
// ─── Krippendorff's Alpha: category (nominal, use ordinal distance = 0/1) ───
|
||
// We encode categories as integers for alpha computation
|
||
const catIndex = new Map(CATEGORIES.map((c, i) => [c, i + 1]));
|
||
|
||
const categoryRatingsMatrix: (number | null)[][] = annotatorIds.map(
|
||
(annotatorId) =>
|
||
multiLabeledParaIds.map((paraId) => {
|
||
const label = multiLabeled
|
||
.get(paraId)
|
||
?.find((l) => l.annotatorId === annotatorId);
|
||
if (!label) return null;
|
||
return catIndex.get(label.contentCategory) ?? null;
|
||
}),
|
||
);
|
||
|
||
// Krippendorff's alpha for category (note: using ordinal distance on nominal data
|
||
// — this is conservative; nominal distance would give higher alpha)
|
||
const categoryAlpha =
|
||
annotatorIds.length >= 2 && multiLabeledParaIds.length > 0
|
||
? krippendorffsAlpha(categoryRatingsMatrix)
|
||
: 0;
|
||
|
||
// ─── Krippendorff's Alpha: specificity (ordinal) ───
|
||
const specRatingsMatrix: (number | null)[][] = annotatorIds.map(
|
||
(annotatorId) =>
|
||
multiLabeledParaIds.map((paraId) => {
|
||
const label = multiLabeled
|
||
.get(paraId)
|
||
?.find((l) => l.annotatorId === annotatorId);
|
||
return label?.specificityLevel ?? null;
|
||
}),
|
||
);
|
||
|
||
const specAlpha =
|
||
annotatorIds.length >= 2 && multiLabeledParaIds.length > 0
|
||
? krippendorffsAlpha(specRatingsMatrix)
|
||
: 0;
|
||
|
||
// ─── Pairwise Cohen's Kappa — category ───
|
||
const kappaCategory: number[][] = Array.from(
|
||
{ length: annotatorIds.length },
|
||
() => new Array(annotatorIds.length).fill(0),
|
||
);
|
||
const kappaCatDetails: {
|
||
a1: string;
|
||
a2: string;
|
||
kappa: number;
|
||
n: number;
|
||
}[] = [];
|
||
|
||
for (let i = 0; i < annotatorIds.length; i++) {
|
||
kappaCategory[i][i] = 1;
|
||
for (let j = i + 1; j < annotatorIds.length; j++) {
|
||
const a1 = annotatorIds[i];
|
||
const a2 = annotatorIds[j];
|
||
const shared1: string[] = [];
|
||
const shared2: string[] = [];
|
||
|
||
for (const [, lbls] of multiLabeled) {
|
||
const l1 = lbls.find((l) => l.annotatorId === a1);
|
||
const l2 = lbls.find((l) => l.annotatorId === a2);
|
||
if (l1 && l2) {
|
||
shared1.push(l1.contentCategory);
|
||
shared2.push(l2.contentCategory);
|
||
}
|
||
}
|
||
|
||
if (shared1.length >= 2) {
|
||
const kappa = cohensKappa(shared1, shared2);
|
||
kappaCategory[i][j] = kappa;
|
||
kappaCategory[j][i] = kappa;
|
||
kappaCatDetails.push({
|
||
a1: annotatorNames.get(a1) ?? a1,
|
||
a2: annotatorNames.get(a2) ?? a2,
|
||
kappa,
|
||
n: shared1.length,
|
||
});
|
||
}
|
||
}
|
||
}
|
||
|
||
// ─── Pairwise Cohen's Kappa — specificity ───
|
||
const kappaSpec: number[][] = Array.from(
|
||
{ length: annotatorIds.length },
|
||
() => new Array(annotatorIds.length).fill(0),
|
||
);
|
||
const kappaSpecDetails: {
|
||
a1: string;
|
||
a2: string;
|
||
kappa: number;
|
||
n: number;
|
||
}[] = [];
|
||
|
||
for (let i = 0; i < annotatorIds.length; i++) {
|
||
kappaSpec[i][i] = 1;
|
||
for (let j = i + 1; j < annotatorIds.length; j++) {
|
||
const a1 = annotatorIds[i];
|
||
const a2 = annotatorIds[j];
|
||
const shared1: string[] = [];
|
||
const shared2: string[] = [];
|
||
|
||
for (const [, lbls] of multiLabeled) {
|
||
const l1 = lbls.find((l) => l.annotatorId === a1);
|
||
const l2 = lbls.find((l) => l.annotatorId === a2);
|
||
if (l1 && l2) {
|
||
shared1.push(String(l1.specificityLevel));
|
||
shared2.push(String(l2.specificityLevel));
|
||
}
|
||
}
|
||
|
||
if (shared1.length >= 2) {
|
||
const kappa = cohensKappa(shared1, shared2);
|
||
kappaSpec[i][j] = kappa;
|
||
kappaSpec[j][i] = kappa;
|
||
kappaSpecDetails.push({
|
||
a1: annotatorNames.get(a1) ?? a1,
|
||
a2: annotatorNames.get(a2) ?? a2,
|
||
kappa,
|
||
n: shared1.length,
|
||
});
|
||
}
|
||
}
|
||
}
|
||
|
||
// ─── Per-category agreement ───
|
||
const perCategory = perCategoryAgreement(
|
||
labels.map((l) => ({
|
||
category: l.contentCategory,
|
||
annotatorId: l.annotatorId,
|
||
paragraphId: l.paragraphId,
|
||
})),
|
||
CATEGORIES,
|
||
);
|
||
|
||
// ─── Per-stratum agreement (using stage1 data to identify strata) ───
|
||
const paragraphMeta = new Map(allParagraphs.map((p) => [p.id, p]));
|
||
|
||
// Classify each paragraph's stratum based on stage1 data
|
||
function classifyStratum(pid: string): string {
|
||
const para = paragraphMeta.get(pid);
|
||
if (!para) return "unknown";
|
||
const method = para.stage1Method;
|
||
const cat = para.stage1Category;
|
||
const spec = para.stage1Specificity;
|
||
|
||
// Check if it was a disputed paragraph based on method
|
||
if (method === "unresolved") return "unresolved";
|
||
if (method === "majority") {
|
||
// Try to identify the dispute type from the category
|
||
if (cat === "Management Role" || cat === "Risk Management Process")
|
||
return "mgmt_rmp_split";
|
||
if (cat === "None/Other" || cat === "Strategy Integration")
|
||
return "noneother_strategy_split";
|
||
if (cat === "Board Governance") return "board_mgmt_split";
|
||
if (spec === 3 || spec === 4) return "spec_34_split";
|
||
return "majority_other";
|
||
}
|
||
if (method === "unanimous") return "unanimous";
|
||
return "proportional_random";
|
||
}
|
||
|
||
const strataAgreement: Record<string, { total: number; agreed: number }> = {};
|
||
for (const [pid, lbls] of fullyLabeled) {
|
||
const stratum = classifyStratum(pid);
|
||
if (!strataAgreement[stratum]) {
|
||
strataAgreement[stratum] = { total: 0, agreed: 0 };
|
||
}
|
||
strataAgreement[stratum].total++;
|
||
const allSameCat = lbls.every(
|
||
(l) => l.contentCategory === lbls[0].contentCategory,
|
||
);
|
||
const allSameSpec = lbls.every(
|
||
(l) => l.specificityLevel === lbls[0].specificityLevel,
|
||
);
|
||
if (allSameCat && allSameSpec) strataAgreement[stratum].agreed++;
|
||
}
|
||
|
||
const strataRates: Record<string, { total: number; agreed: number; rate: number }> = {};
|
||
for (const [stratum, data] of Object.entries(strataAgreement)) {
|
||
strataRates[stratum] = {
|
||
...data,
|
||
rate: data.total > 0 ? data.agreed / data.total : 0,
|
||
};
|
||
}
|
||
|
||
// ─── Timing summary ───
|
||
const allActiveTimes = labels
|
||
.map((l) => l.activeMs)
|
||
.filter((t): t is number => t !== null);
|
||
const allWallTimes = labels
|
||
.map((l) => l.durationMs)
|
||
.filter((t): t is number => t !== null);
|
||
|
||
// ─── Category distribution ───
|
||
const categoryDist: Record<string, number> = {};
|
||
for (const cat of CATEGORIES) categoryDist[cat] = 0;
|
||
for (const l of labels) {
|
||
categoryDist[l.contentCategory] =
|
||
(categoryDist[l.contentCategory] ?? 0) + 1;
|
||
}
|
||
|
||
// ─── Specificity distribution ───
|
||
const specDist: Record<string, number> = { "1": 0, "2": 0, "3": 0, "4": 0 };
|
||
for (const l of labels) {
|
||
specDist[String(l.specificityLevel)] =
|
||
(specDist[String(l.specificityLevel)] ?? 0) + 1;
|
||
}
|
||
|
||
// ─── Majority label distribution (for fully-labeled paragraphs) ───
|
||
const majorityCategories: Record<string, number> = {};
|
||
for (const cat of CATEGORIES) majorityCategories[cat] = 0;
|
||
|
||
for (const lbls of fullyLabeled.values()) {
|
||
const catCounts = new Map<string, number>();
|
||
for (const l of lbls) {
|
||
catCounts.set(l.contentCategory, (catCounts.get(l.contentCategory) ?? 0) + 1);
|
||
}
|
||
let maxCount = 0;
|
||
let majorCat = "";
|
||
for (const [cat, count] of catCounts) {
|
||
if (count > maxCount) {
|
||
maxCount = count;
|
||
majorCat = cat;
|
||
}
|
||
}
|
||
if (majorCat) majorityCategories[majorCat]++;
|
||
}
|
||
|
||
const metrics = {
|
||
summary: {
|
||
totalLabels: labels.length,
|
||
totalParagraphs: allParagraphs.length,
|
||
fullyLabeledParagraphs: fullyLabeled.size,
|
||
adjudicatedParagraphs: allAdjudications.length,
|
||
annotatorCount: annotatorIds.length,
|
||
},
|
||
consensus: {
|
||
categoryOnly: round(categoryConsensusRate, 4),
|
||
specificityOnly: round(specConsensusRate, 4),
|
||
both: round(bothConsensusRate, 4),
|
||
},
|
||
krippendorffsAlpha: {
|
||
category: round(categoryAlpha, 4),
|
||
specificity: round(specAlpha, 4),
|
||
note: "Category alpha uses ordinal distance on nominal data (conservative). Specificity alpha uses ordinal distance.",
|
||
},
|
||
pairwiseKappa: {
|
||
category: {
|
||
annotators: annotatorIds.map((id) => annotatorNames.get(id) ?? id),
|
||
matrix: kappaCategory.map((row) => row.map((v) => round(v, 4))),
|
||
pairs: kappaCatDetails.map((d) => ({
|
||
...d,
|
||
kappa: round(d.kappa, 4),
|
||
})),
|
||
mean: round(
|
||
kappaCatDetails.length > 0
|
||
? kappaCatDetails.reduce((s, d) => s + d.kappa, 0) /
|
||
kappaCatDetails.length
|
||
: 0,
|
||
4,
|
||
),
|
||
},
|
||
specificity: {
|
||
annotators: annotatorIds.map((id) => annotatorNames.get(id) ?? id),
|
||
matrix: kappaSpec.map((row) => row.map((v) => round(v, 4))),
|
||
pairs: kappaSpecDetails.map((d) => ({
|
||
...d,
|
||
kappa: round(d.kappa, 4),
|
||
})),
|
||
mean: round(
|
||
kappaSpecDetails.length > 0
|
||
? kappaSpecDetails.reduce((s, d) => s + d.kappa, 0) /
|
||
kappaSpecDetails.length
|
||
: 0,
|
||
4,
|
||
),
|
||
},
|
||
},
|
||
perCategoryAgreement: Object.fromEntries(
|
||
Object.entries(perCategory).map(([k, v]) => [k, round(v, 4)]),
|
||
),
|
||
perStratumAgreement: strataRates,
|
||
distributions: {
|
||
categoryLabels: categoryDist,
|
||
specificityLabels: specDist,
|
||
majorityCategories,
|
||
},
|
||
timing: {
|
||
overallMedianActiveMs: allActiveTimes.length > 0 ? median(allActiveTimes) : null,
|
||
overallMeanActiveMs: allActiveTimes.length > 0 ? round(mean(allActiveTimes), 0) : null,
|
||
overallMedianDurationMs: allWallTimes.length > 0 ? median(allWallTimes) : null,
|
||
overallMeanDurationMs: allWallTimes.length > 0 ? round(mean(allWallTimes), 0) : null,
|
||
totalActiveHours:
|
||
allActiveTimes.length > 0
|
||
? round(sum(allActiveTimes) / 3_600_000, 2)
|
||
: null,
|
||
totalWallHours:
|
||
allWallTimes.length > 0
|
||
? round(sum(allWallTimes) / 3_600_000, 2)
|
||
: null,
|
||
labelsWithActiveTime: allActiveTimes.length,
|
||
labelsWithoutActiveTime: labels.length - allActiveTimes.length,
|
||
},
|
||
perAnnotator: perAnnotatorStats,
|
||
};
|
||
|
||
await writeFile(`${OUT_DIR}/metrics.json`, JSON.stringify(metrics, null, 2));
|
||
console.log(` metrics → metrics.json`);
|
||
|
||
// ── Print summary to console ──
|
||
console.log("\n" + "=".repeat(60));
|
||
console.log("HUMAN LABELING SUMMARY");
|
||
console.log("=".repeat(60));
|
||
console.log(`\nParagraphs: ${fullyLabeled.size} fully labeled / ${allParagraphs.length} total`);
|
||
console.log(`Labels: ${labels.length} total`);
|
||
console.log(`\n── Consensus Rates (3/3 agree) ──`);
|
||
console.log(` Category only: ${(categoryConsensusRate * 100).toFixed(1)}%`);
|
||
console.log(` Specificity only: ${(specConsensusRate * 100).toFixed(1)}%`);
|
||
console.log(` Both: ${(bothConsensusRate * 100).toFixed(1)}%`);
|
||
console.log(`\n── Krippendorff's Alpha ──`);
|
||
console.log(` Category: ${categoryAlpha.toFixed(4)}`);
|
||
console.log(` Specificity: ${specAlpha.toFixed(4)}`);
|
||
console.log(`\n── Pairwise Kappa (category) ──`);
|
||
console.log(` Mean: ${metrics.pairwiseKappa.category.mean}`);
|
||
for (const pair of kappaCatDetails) {
|
||
console.log(` ${pair.a1} × ${pair.a2}: ${pair.kappa.toFixed(4)} (n=${pair.n})`);
|
||
}
|
||
console.log(`\n── Pairwise Kappa (specificity) ──`);
|
||
console.log(` Mean: ${metrics.pairwiseKappa.specificity.mean}`);
|
||
for (const pair of kappaSpecDetails) {
|
||
console.log(` ${pair.a1} × ${pair.a2}: ${pair.kappa.toFixed(4)} (n=${pair.n})`);
|
||
}
|
||
console.log(`\n── Per-Category Agreement ──`);
|
||
for (const [cat, rate] of Object.entries(perCategory)) {
|
||
console.log(` ${cat}: ${(rate * 100).toFixed(1)}%`);
|
||
}
|
||
console.log(`\n── Per-Stratum Agreement ──`);
|
||
for (const [stratum, data] of Object.entries(strataRates)) {
|
||
console.log(
|
||
` ${stratum}: ${(data.rate * 100).toFixed(1)}% (${data.agreed}/${data.total})`,
|
||
);
|
||
}
|
||
console.log(`\n── Timing ──`);
|
||
if (allActiveTimes.length > 0) {
|
||
console.log(` Median active time: ${(median(allActiveTimes) / 1000).toFixed(1)}s`);
|
||
console.log(` Mean active time: ${(mean(allActiveTimes) / 1000).toFixed(1)}s`);
|
||
console.log(` Total active hours: ${(sum(allActiveTimes) / 3_600_000).toFixed(2)}h`);
|
||
console.log(` Total wall hours: ${(sum(allWallTimes) / 3_600_000).toFixed(2)}h`);
|
||
}
|
||
console.log(` Labels with active timer: ${allActiveTimes.length}/${labels.length}`);
|
||
|
||
console.log(`\n── Per-Annotator ──`);
|
||
for (const a of perAnnotatorStats) {
|
||
const activeH = a.totalActiveMs ? (a.totalActiveMs / 3_600_000).toFixed(2) : "N/A";
|
||
const medSec = a.medianActiveMs ? (a.medianActiveMs / 1000).toFixed(1) : "N/A";
|
||
console.log(
|
||
` ${a.name}: ${a.labelCount} labels, median ${medSec}s active, ${activeH}h total`,
|
||
);
|
||
}
|
||
|
||
console.log(`\n${"=".repeat(60)}`);
|
||
console.log(`All data exported to ${OUT_DIR}/`);
|
||
console.log("=".repeat(60));
|
||
|
||
process.exit(0);
|
||
}
|
||
|
||
function median(arr: number[]): number {
|
||
const sorted = [...arr].sort((a, b) => a - b);
|
||
const mid = Math.floor(sorted.length / 2);
|
||
return sorted.length % 2 !== 0
|
||
? sorted[mid]
|
||
: (sorted[mid - 1] + sorted[mid]) / 2;
|
||
}
|
||
|
||
function mean(arr: number[]): number {
|
||
return arr.reduce((s, v) => s + v, 0) / arr.length;
|
||
}
|
||
|
||
function sum(arr: number[]): number {
|
||
return arr.reduce((s, v) => s + v, 0);
|
||
}
|
||
|
||
function round(n: number, decimals: number): number {
|
||
const factor = 10 ** decimals;
|
||
return Math.round(n * factor) / factor;
|
||
}
|
||
|
||
main().catch((err) => {
|
||
console.error("Dump failed:", err);
|
||
process.exit(1);
|
||
});
|