100 lines
2.9 KiB
TypeScript
100 lines
2.9 KiB
TypeScript
// Default to the local development database when the caller has not supplied
// a connection string; an already-set DATABASE_URL is left untouched.
// NOTE(review): in an ES module, static imports are evaluated before this
// statement runs. If "../db" reads DATABASE_URL at import time, this default
// would arrive too late — confirm how the db module obtains its URL.
if (process.env.DATABASE_URL == null) {
  process.env.DATABASE_URL =
    "postgresql://sec_cybert:sec_cybert@localhost:5432/sec_cybert";
}
|
|
|
|
import { writeFile, mkdir } from "node:fs/promises";
|
|
import { existsSync } from "node:fs";
|
|
import { dirname } from "node:path";
|
|
import { db } from "../db";
|
|
import * as schema from "../db/schema";
|
|
|
|
/**
 * Destination of the exported gold-label JSONL file.
 *
 * Can be overridden with the `GOLD_LABELS_OUTPUT_PATH` environment variable so
 * the script is runnable on machines other than the original author's. When
 * the variable is unset or empty, the original hard-coded location is used.
 */
const OUTPUT_PATH =
  // `||` (not `??`) on purpose: an empty string must also fall back to the default.
  process.env.GOLD_LABELS_OUTPUT_PATH ||
  "/home/joey/Documents/sec-cyBERT/data/gold/gold-labels.jsonl";
|
|
|
|
/**
 * Groups rows by their `paragraphId`, preserving the input order of the rows
 * inside each group.
 */
function groupByParagraphId<T extends { paragraphId: string }>(
  rows: readonly T[],
): Map<string, T[]> {
  const groups = new Map<string, T[]>();
  for (const row of rows) {
    const existing = groups.get(row.paragraphId);
    if (existing) {
      existing.push(row);
    } else {
      groups.set(row.paragraphId, [row]);
    }
  }
  return groups;
}

/** Serializes records as JSON Lines: one JSON document per line, newline-terminated. */
function toJsonl(records: readonly unknown[]): string {
  return records.map((record) => JSON.stringify(record)).join("\n") + "\n";
}

/**
 * Exports every adjudicated paragraph as one gold-label record per line to
 * `OUTPUT_PATH`.
 *
 * Each record carries the adjudicated final label, the adjudication method and
 * the human labels recorded for the same paragraph. The process exits with
 * status 0 on success, including when there are no adjudications (in which
 * case nothing is written). Failures propagate as a rejected promise.
 */
async function main(): Promise<void> {
  // 1. Load all adjudicated paragraphs
  console.log("Loading adjudications...");
  const adjudications = await db.select().from(schema.adjudications);
  console.log(` ${adjudications.length} adjudicated paragraphs`);

  if (adjudications.length === 0) {
    console.log("No adjudications found. Nothing to export.");
    process.exit(0);
  }

  // 2. Load all human labels for adjudicated paragraphs
  // NOTE(review): the whole human-labels table is fetched and filtered in
  // memory; consider pushing the paragraph filter into the query if it grows.
  console.log("Loading human labels...");
  const adjudicatedIds = new Set(adjudications.map((a) => a.paragraphId));
  const allHumanLabels = await db.select().from(schema.humanLabels);
  const relevantLabels = allHumanLabels.filter((l) =>
    adjudicatedIds.has(l.paragraphId),
  );
  console.log(
    ` ${relevantLabels.length} human labels for adjudicated paragraphs`,
  );

  // Group human labels by paragraph
  const labelsByParagraph = groupByParagraphId(relevantLabels);

  // 3. Build GoldLabel records (shape is inferred instead of erased to `object`)
  const goldLabels = adjudications.map((adj) => ({
    paragraphId: adj.paragraphId,
    finalLabel: {
      content_category: adj.finalCategory,
      specificity_level: adj.finalSpecificity,
      // Confidence values are fixed to "high" for every exported label.
      category_confidence: "high",
      specificity_confidence: "high",
      reasoning: adj.notes ?? "",
    },
    adjudicationMethod: adj.method,
    // Paragraphs without any human label get an empty list.
    humanLabels: (labelsByParagraph.get(adj.paragraphId) ?? []).map((hl) => ({
      paragraphId: hl.paragraphId,
      annotatorId: hl.annotatorId,
      label: {
        content_category: hl.contentCategory,
        specificity_level: hl.specificityLevel,
        category_confidence: "high",
        specificity_confidence: "high",
        reasoning: hl.notes ?? "",
      },
      // A missing timestamp is replaced by the export time, so such rows are
      // not reproducible between runs.
      labeledAt: hl.labeledAt?.toISOString() ?? new Date().toISOString(),
      // `undefined` makes JSON.stringify omit the key when there are no notes.
      notes: hl.notes ?? undefined,
    })),
  }));

  // 4. Write JSONL
  // Recursive mkdir succeeds when the directory already exists, so no prior
  // existence check (and no check-then-create race) is needed.
  await mkdir(dirname(OUTPUT_PATH), { recursive: true });
  await writeFile(OUTPUT_PATH, toJsonl(goldLabels));

  console.log(`\nExported ${goldLabels.length} gold labels to ${OUTPUT_PATH}`);
  process.exit(0);
}
|
|
|
|
// Script entry point: run the export; any rejection is logged and turned into
// a non-zero exit status.
const reportFailureAndExit = (reason: unknown): never => {
  console.error("Export failed:", reason);
  process.exit(1);
};

main().catch(reportFailureAndExit);
|