// Fallback connection string for local development.
// NOTE(review): static `import` declarations are hoisted, so under native ESM
// the modules imported below are evaluated before this assignment runs. If
// "../db" reads DATABASE_URL while it is being loaded, this default arrives
// too late — confirm against that module (a dynamic `await import("../db")`
// inside main() would guarantee the intended order).
process.env.DATABASE_URL ??=
  "postgresql://sec_cybert:sec_cybert@localhost:5432/sec_cybert";

import { writeFile, mkdir } from "node:fs/promises";
import { existsSync } from "node:fs";
import { dirname } from "node:path";
import { db } from "../db";
import * as schema from "../db/schema";

/** Default export location, used when no path is given on the command line. */
const OUTPUT_PATH =
  "/home/joey/Documents/sec-cyBERT/data/gold/gold-labels.jsonl";

/**
 * Buckets `items` by the key returned from `keyOf`.
 *
 * @param items - Values to group; not mutated.
 * @param keyOf - Returns the grouping key for one item.
 * @returns A map from key to the items that produced it, in input order.
 */
function groupBy<T, K>(
  items: readonly T[],
  keyOf: (item: T) => K,
): Map<K, T[]> {
  const groups = new Map<K, T[]>();
  for (const item of items) {
    const key = keyOf(item);
    const bucket = groups.get(key);
    if (bucket) {
      bucket.push(item);
    } else {
      groups.set(key, [item]);
    }
  }
  return groups;
}

/**
 * Serializes records as JSON Lines: one `JSON.stringify` result per line,
 * followed by a trailing newline.
 */
function toJsonl(records: readonly object[]): string {
  return records.map((record) => JSON.stringify(record)).join("\n") + "\n";
}

/**
 * Exports every adjudicated paragraph, together with the human labels recorded
 * for it, as a JSONL file.
 *
 * The destination is the first command-line argument when one is supplied,
 * otherwise `OUTPUT_PATH`. The process exits with code 0 both after a
 * successful export and when there are no adjudications to export.
 */
async function main(): Promise<void> {
  // `||` rather than `??` on purpose: an empty or blank argument should also
  // fall back to the default path.
  const outputPath = process.argv[2]?.trim() || OUTPUT_PATH;

  // 1. Load all adjudicated paragraphs
  console.log("Loading adjudications...");
  const adjudications = await db.select().from(schema.adjudications);
  console.log(` ${adjudications.length} adjudicated paragraphs`);

  if (adjudications.length === 0) {
    console.log("No adjudications found. Nothing to export.");
    process.exit(0);
  }

  // 2. Load all human labels, keeping only those for adjudicated paragraphs
  console.log("Loading human labels...");
  const adjudicatedIds = new Set(adjudications.map((a) => a.paragraphId));
  const allHumanLabels = await db.select().from(schema.humanLabels);
  const relevantLabels = allHumanLabels.filter((l) =>
    adjudicatedIds.has(l.paragraphId),
  );
  console.log(
    ` ${relevantLabels.length} human labels for adjudicated paragraphs`,
  );

  const labelsByParagraph = groupBy(relevantLabels, (l) => l.paragraphId);

  // 3. Build one gold-label record per adjudication. Both confidence fields
  //    are always written as the constant "high".
  const goldLabels = adjudications.map((adj) => {
    const humanLabels = (labelsByParagraph.get(adj.paragraphId) ?? []).map(
      (hl) => ({
        paragraphId: hl.paragraphId,
        annotatorId: hl.annotatorId,
        label: {
          content_category: hl.contentCategory,
          specificity_level: hl.specificityLevel,
          category_confidence: "high",
          specificity_confidence: "high",
          reasoning: hl.notes ?? "",
        },
        // Falls back to the export time when the row carries no timestamp.
        labeledAt: hl.labeledAt?.toISOString() ?? new Date().toISOString(),
        // `undefined` (unlike `null`) is omitted by JSON.stringify.
        notes: hl.notes ?? undefined,
      }),
    );

    return {
      paragraphId: adj.paragraphId,
      finalLabel: {
        content_category: adj.finalCategory,
        specificity_level: adj.finalSpecificity,
        category_confidence: "high",
        specificity_confidence: "high",
        reasoning: adj.notes ?? "",
      },
      adjudicationMethod: adj.method,
      humanLabels,
    };
  });

  // 4. Write JSONL, creating the destination directory if it is missing
  const dir = dirname(outputPath);
  if (!existsSync(dir)) {
    await mkdir(dir, { recursive: true });
  }
  await writeFile(outputPath, toJsonl(goldLabels));

  console.log(`\nExported ${goldLabels.length} gold labels to ${outputPath}`);
  process.exit(0);
}

main().catch((err) => {
  console.error("Export failed:", err);
  process.exit(1);
});