// Fall back to the local development database when no connection string is set.
// NOTE(review): under native ES modules, static imports are evaluated before
// this statement runs. If "../db" reads DATABASE_URL at import time, this
// default would be applied too late — confirm against the runtime/transpiler
// in use.
process.env.DATABASE_URL ??=
  "postgresql://sec_cybert:sec_cybert@localhost:5432/sec_cybert";

import { readFile, writeFile } from "node:fs/promises";
import { db } from "../db";
import * as schema from "../db/schema";
import {
  type ParagraphWithVotes,
  defaultSamplingConfig,
  stratifiedSample,
} from "../lib/sampling";

/**
 * Reads a JSON Lines file: one JSON document per line, blank lines skipped.
 *
 * The parsed values are cast to `T` without runtime validation, so the caller
 * is responsible for the file actually matching that shape.
 *
 * @param path - Path of the JSONL file to read (decoded as UTF-8).
 * @returns The parsed rows, in file order.
 * @throws Error if a non-blank line is not valid JSON; the message names the
 *   file and the 1-based line number. File-system errors from `readFile`
 *   propagate unchanged.
 */
async function readJsonl<T>(path: string): Promise<T[]> {
  const text = await readFile(path, "utf-8");
  const rows: T[] = [];

  text.split("\n").forEach((line, index) => {
    // Skip empty / whitespace-only lines (e.g. the trailing newline).
    if (!line.trim()) return;

    try {
      rows.push(JSON.parse(line) as T);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      throw new Error(`Invalid JSON at ${path}:${index + 1}: ${reason}`);
    }
  });

  return rows;
}

/** The fields of one annotation record that this script reads. */
interface AnnotationRow {
  paragraphId: string;
  label: {
    content_category: string;
    specificity_level: number;
  };
}

// NOTE(review): both paths are absolute and machine-specific.
const OUTPUT_PATH = "/home/joey/Documents/sec-cyBERT/labelapp/.sampled-ids.json";
const ANNOTATIONS_PATH =
  "/home/joey/Documents/sec-cyBERT/data/annotations/stage1.jsonl";

/**
 * Loads every paragraph from the database, attaches the per-paragraph category
 * and specificity votes found in the annotations file, runs `stratifiedSample`
 * with the default config, and writes the selected IDs to `OUTPUT_PATH` as
 * pretty-printed JSON. Exits the process with code 0 on success.
 */
async function main() {
  // 1. Load all paragraphs from DB
  console.log("Loading paragraphs from DB...");
  const dbParagraphs = await db.select().from(schema.paragraphs);
  console.log(` ${dbParagraphs.length} paragraphs loaded`);

  // 2. Load raw annotations for split-vote detection
  console.log("Loading annotations for vote analysis...");
  const annotations = await readJsonl<AnnotationRow>(ANNOTATIONS_PATH);
  console.log(` ${annotations.length} annotations loaded`);

  // Group votes by paragraph: one entry per paragraphId, accumulating every
  // annotation's category and specificity in file order.
  const votesByParagraph = new Map<
    string,
    { categories: string[]; specificities: number[] }
  >();
  for (const a of annotations) {
    let votes = votesByParagraph.get(a.paragraphId);
    if (!votes) {
      votes = { categories: [], specificities: [] };
      votesByParagraph.set(a.paragraphId, votes);
    }
    votes.categories.push(a.label.content_category);
    votes.specificities.push(a.label.specificity_level);
  }

  // 3. Build ParagraphWithVotes array
  // Paragraphs with no annotations get empty vote lists rather than being dropped.
  const paragraphsWithVotes: ParagraphWithVotes[] = dbParagraphs.map((p) => {
    const votes = votesByParagraph.get(p.id);
    return {
      id: p.id,
      stage1Category: p.stage1Category,
      stage1Specificity: p.stage1Specificity,
      categoryVotes: votes?.categories ?? [],
      specificityVotes: votes?.specificities ?? [],
    };
  });

  // 4. Run stratified sampling
  console.log("Running stratified sampling...");
  const config = defaultSamplingConfig();
  const selectedIds = stratifiedSample(paragraphsWithVotes, config);

  // 5. Write output
  await writeFile(OUTPUT_PATH, JSON.stringify(selectedIds, null, 2));
  console.log(`\nWrote ${selectedIds.length} sampled IDs to ${OUTPUT_PATH}`);

  // Exit explicitly once the output is written.
  // NOTE(review): presumably needed because the DB client keeps the event loop
  // alive — confirm.
  process.exit(0);
}

main().catch((err) => {
  console.error("Sampling failed:", err);
  process.exit(1);
});