2026-03-29 00:32:24 -04:00

100 lines
2.9 KiB
TypeScript

// Fall back to a localhost Postgres connection string when DATABASE_URL is
// not already present in the environment; an existing value is left untouched.
// NOTE(review): under native ES module semantics the import declarations in
// this file are evaluated before this statement runs — confirm that "../db"
// reads DATABASE_URL lazily (or that the runner preserves statement order).
if (process.env.DATABASE_URL == null) {
  process.env.DATABASE_URL =
    "postgresql://sec_cybert:sec_cybert@localhost:5432/sec_cybert";
}
import { writeFile, mkdir } from "node:fs/promises";
import { existsSync } from "node:fs";
import { dirname } from "node:path";
import { db } from "../db";
import * as schema from "../db/schema";
/** Absolute path of the JSONL file that the export writes its records to. */
const OUTPUT_PATH = "/home/joey/Documents/sec-cyBERT/data/gold/gold-labels.jsonl";
/**
 * Exports every adjudicated paragraph as one JSON record per line (JSONL).
 *
 * Steps:
 *  1. Load all rows from `schema.adjudications`; exit with code 0 if none.
 *  2. Load all rows from `schema.humanLabels`, keep those whose paragraph was
 *     adjudicated, and group them by `paragraphId`.
 *  3. Build one record per adjudication containing the final label, the
 *     adjudication method, and the individual human labels.
 *  4. Write the records to `OUTPUT_PATH`, creating its directory if needed.
 *
 * The process is terminated explicitly with `process.exit(0)` on success.
 * Errors are not caught here; they propagate to the caller as a rejection.
 */
async function main(): Promise<void> {
  // 1. Load all adjudicated paragraphs
  console.log("Loading adjudications...");
  const adjudications = await db.select().from(schema.adjudications);
  console.log(` ${adjudications.length} adjudicated paragraphs`);
  if (adjudications.length === 0) {
    console.log("No adjudications found. Nothing to export.");
    process.exit(0);
  }

  // 2. Load all human labels for adjudicated paragraphs
  console.log("Loading human labels...");
  const adjudicatedIds = new Set(adjudications.map((a) => a.paragraphId));
  // The whole table is fetched and filtered in memory rather than in SQL.
  const allHumanLabels = await db.select().from(schema.humanLabels);
  const relevantLabels = allHumanLabels.filter((l) =>
    adjudicatedIds.has(l.paragraphId),
  );
  console.log(
    ` ${relevantLabels.length} human labels for adjudicated paragraphs`,
  );

  // Group human labels by paragraph
  const labelsByParagraph = new Map<
    string,
    (typeof relevantLabels)[number][]
  >();
  for (const label of relevantLabels) {
    const group = labelsByParagraph.get(label.paragraphId);
    if (group) {
      group.push(label);
    } else {
      labelsByParagraph.set(label.paragraphId, [label]);
    }
  }

  // 3. Build GoldLabel records (one per adjudication, in query order)
  const goldLabels = adjudications.map((adj) => {
    const humanLabels = (labelsByParagraph.get(adj.paragraphId) ?? []).map(
      (hl) => ({
        paragraphId: hl.paragraphId,
        annotatorId: hl.annotatorId,
        label: {
          content_category: hl.contentCategory,
          specificity_level: hl.specificityLevel,
          // Confidence fields are hard-coded for every human label.
          category_confidence: "high",
          specificity_confidence: "high",
          reasoning: hl.notes ?? "",
        },
        // A missing timestamp falls back to the time of this export run.
        labeledAt: hl.labeledAt?.toISOString() ?? new Date().toISOString(),
        // `undefined` (unlike `null`) is omitted by JSON.stringify.
        notes: hl.notes ?? undefined,
      }),
    );
    return {
      paragraphId: adj.paragraphId,
      finalLabel: {
        content_category: adj.finalCategory,
        specificity_level: adj.finalSpecificity,
        category_confidence: "high",
        specificity_confidence: "high",
        reasoning: adj.notes ?? "",
      },
      adjudicationMethod: adj.method,
      humanLabels,
    };
  });

  // 4. Write JSONL
  // `recursive: true` makes mkdir succeed when the directory already exists,
  // so no separate (and racy) existence check is needed beforehand.
  await mkdir(dirname(OUTPUT_PATH), { recursive: true });
  const content = goldLabels.map((r) => JSON.stringify(r)).join("\n") + "\n";
  await writeFile(OUTPUT_PATH, content);
  console.log(`\nExported ${goldLabels.length} gold labels to ${OUTPUT_PATH}`);
  process.exit(0);
}
// Script entry point: run the export; on any rejection, log the error and
// terminate the process with exit code 1.
main().catch((error: unknown) => {
  console.error("Export failed:", error);
  process.exit(1);
});