2026-03-29 00:32:24 -04:00

88 lines
2.6 KiB
TypeScript

// Fall back to the local development database when the caller has not
// provided DATABASE_URL; an existing value is never overwritten.
// NOTE(review): under native ESM, static imports are evaluated before any
// statement in this module, so "../db" may not observe this default at load
// time unless it reads the variable lazily — confirm with the runner in use.
if (process.env.DATABASE_URL === undefined) {
  process.env.DATABASE_URL =
    "postgresql://sec_cybert:sec_cybert@localhost:5432/sec_cybert";
}
import { readFile, writeFile } from "node:fs/promises";
import { db } from "../db";
import * as schema from "../db/schema";
import {
type ParagraphWithVotes,
defaultSamplingConfig,
stratifiedSample,
} from "../lib/sampling";
/**
 * Reads a JSON Lines file and returns one parsed value per non-blank line.
 *
 * Each line is trimmed before parsing, so a leading UTF-8 byte-order mark,
 * trailing `\r` from CRLF files, and surrounding whitespace are tolerated.
 * Blank (whitespace-only) lines are skipped.
 *
 * The parsed values are cast to `T` without runtime validation; callers are
 * responsible for ensuring the file actually matches that shape.
 *
 * @param path - Location of the `.jsonl` file to read (decoded as UTF-8).
 * @returns The parsed rows, in file order.
 * @throws SyntaxError if a non-blank line is not valid JSON; the message
 *   includes the file path and 1-based line number.
 */
async function readJsonl<T = unknown>(path: string): Promise<T[]> {
  const text = await readFile(path, "utf-8");
  const rows: T[] = [];
  for (const [index, rawLine] of text.split("\n").entries()) {
    // trim() also strips U+FEFF, which JSON.parse itself would reject.
    const line = rawLine.trim();
    if (line === "") continue;
    try {
      rows.push(JSON.parse(line) as T);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      throw new SyntaxError(`${path}:${index + 1}: invalid JSON (${reason})`);
    }
  }
  return rows;
}
/**
 * Shape this script assumes for each line of the stage-1 annotations file.
 *
 * Only the fields the script reads are declared. Rows are cast to this type
 * after JSON parsing without runtime validation, so a malformed row will only
 * surface when one of these fields is accessed.
 */
interface AnnotationRow {
// Key used to group annotations and to match them to a DB paragraph's `id`.
paragraphId: string;
label: {
// One annotation's category; collected per paragraph as a category vote.
content_category: string;
// One annotation's specificity; collected per paragraph as a specificity vote.
// NOTE(review): valid range/scale is not visible here — confirm against the annotation schema.
specificity_level: number;
};
}
/**
 * Destination file for the JSON array of sampled paragraph IDs.
 *
 * Defaults to the original author's checkout location; set the
 * `SAMPLED_IDS_PATH` environment variable to run from a different machine or
 * directory without editing this file.
 */
const OUTPUT_PATH =
  process.env.SAMPLED_IDS_PATH ??
  "/home/joey/Documents/sec-cyBERT/labelapp/.sampled-ids.json";

/**
 * Stage-1 annotations file (JSON Lines) used for per-paragraph vote analysis.
 *
 * Defaults to the original author's checkout location; set the
 * `STAGE1_ANNOTATIONS_PATH` environment variable to override it.
 */
const ANNOTATIONS_PATH =
  process.env.STAGE1_ANNOTATIONS_PATH ??
  "/home/joey/Documents/sec-cyBERT/data/annotations/stage1.jsonl";
/**
 * Script entry point.
 *
 * Loads every paragraph row from the database and every stage-1 annotation
 * from ANNOTATIONS_PATH, attaches each paragraph's per-annotation category and
 * specificity votes, runs `stratifiedSample` with the default config, writes
 * the selected IDs to OUTPUT_PATH as pretty-printed JSON, and then exits the
 * process with status 0.
 */
async function main() {
  // Step 1: fetch all paragraph rows.
  console.log("Loading paragraphs from DB...");
  const paragraphRows = await db.select().from(schema.paragraphs);
  console.log(` ${paragraphRows.length} paragraphs loaded`);

  // Step 2: read the raw annotations so individual votes are available.
  console.log("Loading annotations for vote analysis...");
  const annotationRows = await readJsonl<AnnotationRow>(ANNOTATIONS_PATH);
  console.log(` ${annotationRows.length} annotations loaded`);

  // Collect every annotation's votes under its paragraph ID.
  const ballots = new Map<
    string,
    { categories: string[]; specificities: number[] }
  >();
  for (const { paragraphId, label } of annotationRows) {
    const existing = ballots.get(paragraphId);
    if (existing !== undefined) {
      existing.categories.push(label.content_category);
      existing.specificities.push(label.specificity_level);
      continue;
    }
    ballots.set(paragraphId, {
      categories: [label.content_category],
      specificities: [label.specificity_level],
    });
  }

  // Step 3: pair each DB paragraph with its votes (empty lists when it has none).
  const candidates: ParagraphWithVotes[] = [];
  for (const row of paragraphRows) {
    const ballot = ballots.get(row.id);
    candidates.push({
      id: row.id,
      stage1Category: row.stage1Category,
      stage1Specificity: row.stage1Specificity,
      categoryVotes: ballot?.categories ?? [],
      specificityVotes: ballot?.specificities ?? [],
    });
  }

  // Step 4: draw the sample.
  console.log("Running stratified sampling...");
  const selectedIds = stratifiedSample(candidates, defaultSamplingConfig());

  // Step 5: persist the result and report where it went.
  const serialized = JSON.stringify(selectedIds, null, 2);
  await writeFile(OUTPUT_PATH, serialized);
  console.log(`\nWrote ${selectedIds.length} sampled IDs to ${OUTPUT_PATH}`);

  // Exit explicitly rather than waiting for the event loop to drain.
  process.exit(0);
}
// Run the script; any rejection is logged and turned into exit status 1.
main().catch((failure) => {
  console.error("Sampling failed:", failure);
  process.exit(1);
});