// Default connection string for local development; an already-set
// DATABASE_URL takes precedence.
// NOTE(review): under native ES modules, static imports are evaluated before
// this assignment runs — confirm that "../db" reads DATABASE_URL lazily, or
// that the runtime/transpiler preserves source order.
process.env.DATABASE_URL ??=
  "postgresql://sec_cybert:sec_cybert@localhost:5432/sec_cybert";

import { readFile } from "node:fs/promises";
import { db } from "../db";
import * as schema from "../db/schema";

/**
 * Reads a JSON Lines file and parses every non-blank line.
 *
 * The parsed values are cast to `T` without runtime validation, so the caller
 * is responsible for choosing a `T` that matches the file's contents.
 *
 * @param path - File to read (decoded as UTF-8).
 * @returns One parsed value per non-blank line, in file order.
 * @throws SyntaxError if a line is not valid JSON; the message carries the
 *   file path and 1-based line number.
 */
async function readJsonl<T>(path: string): Promise<T[]> {
  const text = await readFile(path, "utf-8");
  const rows: T[] = [];
  text.split("\n").forEach((line, index) => {
    // Skip blank / whitespace-only lines (e.g. the trailing newline).
    if (!line.trim()) return;
    try {
      rows.push(JSON.parse(line) as T);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      throw new SyntaxError(`${path}:${index + 1}: invalid JSON (${reason})`);
    }
  });
  return rows;
}

/** One record of the paragraphs JSONL file, as consumed by this script. */
interface ParagraphRow {
  id: string;
  text: string;
  textHash: string;
  wordCount: number;
  paragraphIndex: number;
  filing: {
    companyName: string;
    cik: string;
    ticker: string;
    filingType: string;
    filingDate: string;
    fiscalYear: number;
    accessionNumber: string;
    secItem: string;
  };
}

/** One record of the annotations JSONL file, as consumed by this script. */
interface AnnotationRow {
  paragraphId: string;
  label: {
    content_category: string;
    specificity_level: number;
    category_confidence: string;
    specificity_confidence: string;
    reasoning: string;
  };
  // Not read by this script, so its shape is left open.
  provenance: Record<string, unknown>;
}

/** Aggregated label for a single paragraph. */
interface Consensus {
  category: string;
  specificity: number;
  method: string;
  confidence: number;
}

/**
 * Returns the most frequent value and how often it occurs.
 *
 * Ties are broken in favour of the value that appears first in `values`
 * (Map iteration follows insertion order and the comparison is strict).
 * `fallback` is returned with a count of 0 when `values` is empty.
 */
function majorityVote<T>(
  values: readonly T[],
  fallback: T,
): { winner: T; count: number } {
  const counts = new Map<T, number>();
  for (const value of values) {
    counts.set(value, (counts.get(value) ?? 0) + 1);
  }

  let winner = fallback;
  let count = 0;
  for (const [value, occurrences] of counts) {
    if (occurrences > count) {
      count = occurrences;
      winner = value;
    }
  }
  return { winner, count };
}

/**
 * Collapses all annotations of one paragraph into a single consensus label.
 *
 * Category and specificity are each decided by an independent majority vote
 * (first-seen value wins a tie).
 *
 * @param annotations - Annotations belonging to the same paragraph.
 * @returns
 *   - `category` / `specificity`: the majority values (`""` / `0` when
 *     `annotations` is empty).
 *   - `method`: `"unanimous"` when every annotation matches the majority on
 *     both dimensions, otherwise `"majority"`.
 *   - `confidence`: fraction of annotations that match the majority on both
 *     dimensions; `0` for an empty input instead of `NaN`.
 */
function computeConsensus(annotations: AnnotationRow[]): Consensus {
  const total = annotations.length;

  const categoryVote = majorityVote(
    annotations.map((a) => a.label.content_category),
    "",
  );
  const specificityVote = majorityVote(
    annotations.map((a) => a.label.specificity_level),
    0,
  );

  const unanimous =
    categoryVote.count === total && specificityVote.count === total;

  // Confidence = fraction of annotators that agreed with majority on both
  const agreedOnBoth = annotations.filter(
    (a) =>
      a.label.content_category === categoryVote.winner &&
      a.label.specificity_level === specificityVote.winner,
  ).length;

  return {
    category: categoryVote.winner,
    specificity: specificityVote.winner,
    method: unanimous ? "unanimous" : "majority",
    // Guard the 0 / 0 case so an empty group never yields NaN.
    confidence: total === 0 ? 0 : agreedOnBoth / total,
  };
}

/**
 * Seeds the database:
 *   1. loads the stage-1 annotations and computes a consensus per paragraph,
 *   2. inserts all paragraphs (with their consensus, if any) in batches,
 *   3. inserts the fixed list of annotator accounts.
 *
 * Every insert uses `onConflictDoNothing()`. Exits the process with code 0
 * on success.
 */
async function main() {
  const PARAGRAPHS_PATH =
    "/home/joey/Documents/sec-cyBERT/data/paragraphs/paragraphs-clean.jsonl";
  const ANNOTATIONS_PATH =
    "/home/joey/Documents/sec-cyBERT/data/annotations/stage1.jsonl";

  // 1. Read annotations and compute consensus per paragraph
  console.log("Reading annotations...");
  const annotations = await readJsonl<AnnotationRow>(ANNOTATIONS_PATH);
  console.log(` ${annotations.length} annotations loaded`);

  const annotationsByParagraph = new Map<string, AnnotationRow[]>();
  for (const a of annotations) {
    const group = annotationsByParagraph.get(a.paragraphId);
    if (group) {
      group.push(a);
    } else {
      annotationsByParagraph.set(a.paragraphId, [a]);
    }
  }
  console.log(
    ` ${annotationsByParagraph.size} paragraphs have annotations`,
  );

  const consensusMap = new Map<string, Consensus>();
  for (const [pid, anns] of annotationsByParagraph) {
    consensusMap.set(pid, computeConsensus(anns));
  }

  // 2. Read paragraphs and insert in batches
  console.log("Reading paragraphs...");
  const paragraphs = await readJsonl<ParagraphRow>(PARAGRAPHS_PATH);
  console.log(` ${paragraphs.length} paragraphs loaded`);

  const BATCH_SIZE = 1000;
  for (let i = 0; i < paragraphs.length; i += BATCH_SIZE) {
    const batch = paragraphs.slice(i, i + BATCH_SIZE);
    const rows = batch.map((p) => {
      // Paragraphs without any annotation get null stage-1 columns.
      const consensus = consensusMap.get(p.id);
      return {
        id: p.id,
        text: p.text,
        wordCount: p.wordCount,
        paragraphIndex: p.paragraphIndex,
        companyName: p.filing.companyName,
        cik: p.filing.cik,
        // `||` on purpose: an empty ticker string is stored as null.
        ticker: p.filing.ticker || null,
        filingType: p.filing.filingType,
        filingDate: p.filing.filingDate,
        fiscalYear: p.filing.fiscalYear,
        accessionNumber: p.filing.accessionNumber,
        secItem: p.filing.secItem,
        stage1Category: consensus?.category ?? null,
        stage1Specificity: consensus?.specificity ?? null,
        stage1Method: consensus?.method ?? null,
        stage1Confidence: consensus?.confidence ?? null,
      };
    });

    await db
      .insert(schema.paragraphs)
      .values(rows)
      .onConflictDoNothing();

    const progress = Math.min(i + BATCH_SIZE, paragraphs.length);
    console.log(` Inserted ${progress}/${paragraphs.length} paragraphs`);
  }

  // 3. Create annotator accounts
  console.log("Creating annotator accounts...");
  const annotatorAccounts = [
    { id: "aaryan", displayName: "Aaryan", password: "sec-cybert" },
    { id: "anuj", displayName: "Anuj", password: "sec-cybert" },
    { id: "meghan", displayName: "Meghan", password: "sec-cybert" },
    { id: "xander", displayName: "Xander", password: "sec-cybert" },
    { id: "elisabeth", displayName: "Elisabeth", password: "sec-cybert" },
    { id: "joey", displayName: "Joey", password: "sec-cybert" },
    { id: "admin", displayName: "Admin", password: "sec-cybert" },
  ];

  await db
    .insert(schema.annotators)
    .values(annotatorAccounts)
    .onConflictDoNothing();
  console.log(` Created ${annotatorAccounts.length} annotator accounts`);

  console.log("Seed complete.");
  process.exit(0);
}

// Entry point: any failure is logged and turned into a non-zero exit code.
main().catch((err) => {
  console.error("Seed failed:", err);
  process.exit(1);
});