// Listing metadata: 90 lines, 2.7 KiB, TypeScript
// Fall back to the local development database when the caller has not
// supplied a connection string; an existing DATABASE_URL is left untouched.
// NOTE(review): presumably "../db" reads this variable. Under native ESM,
// static imports are evaluated before this statement runs, so confirm the
// database module reads it lazily.
if (process.env.DATABASE_URL == null) {
  process.env.DATABASE_URL =
    "postgresql://sec_cybert:sec_cybert@localhost:5432/sec_cybert";
}
|
|
|
|
import { readFile, writeFile } from "node:fs/promises";
|
|
import { db } from "../db";
|
|
import * as schema from "../db/schema";
|
|
import {
|
|
type ParagraphWithVotes,
|
|
defaultSamplingConfig,
|
|
stratifiedSample,
|
|
} from "../lib/sampling";
|
|
|
|
/**
 * Read a JSON Lines file and parse every non-blank line.
 *
 * A leading UTF-8 byte-order mark is ignored, both LF and CRLF line endings
 * are accepted, and whitespace-only lines are skipped.
 *
 * The parsed values are cast to `T` without any runtime validation, so the
 * caller is responsible for checking the shape of each record.
 *
 * @param path - Location of the JSON Lines file to read.
 * @returns One parsed value per non-blank line, in file order.
 * @throws SyntaxError when a line is not valid JSON; the message names the
 *   file and the 1-based line number of the offending line.
 */
async function readJsonl<T = unknown>(path: string): Promise<T[]> {
  const text = await readFile(path, "utf-8");

  // JSON.parse rejects a byte-order mark, so drop it before splitting.
  const body = text.charCodeAt(0) === 0xfeff ? text.slice(1) : text;

  const records: T[] = [];
  let lineNumber = 0;
  for (const rawLine of body.split(/\r?\n/)) {
    lineNumber += 1;
    const line = rawLine.trim();
    if (line === "") {
      continue;
    }
    try {
      records.push(JSON.parse(line) as T);
    } catch (err) {
      // Re-throw with the location so a bad record can be found in a large file.
      const reason = err instanceof Error ? err.message : String(err);
      throw new SyntaxError(`${path}:${lineNumber}: invalid JSON (${reason})`);
    }
  }
  return records;
}
|
|
|
|
/** Label payload of one annotation record; only the fields this script reads. */
interface AnnotationLabel {
  /** Category vote cast by this annotation. */
  content_category: string;
  /** Specificity vote cast by this annotation. */
  specificity_level: number;
}

/**
 * One record of the annotations JSON Lines file, limited to the fields this
 * script reads. Records are obtained through an unchecked cast of parsed
 * JSON, so this describes the expected shape rather than a verified one.
 */
interface AnnotationRow {
  /** Identifier used to match the vote against the `id` of a DB paragraph. */
  paragraphId: string;
  /** The votes carried by this record. */
  label: AnnotationLabel;
}
|
|
|
|
// Built-in locations used when the corresponding environment variable is unset.
const DEFAULT_OUTPUT_PATH =
  "/home/joey/Documents/sec-cyBERT/labelapp/.sampled-ids.json";
const DEFAULT_ANNOTATIONS_PATH =
  "/home/joey/Documents/sec-cyBERT/data/annotations/stage1.jsonl";

/** File the sampled paragraph IDs are written to; override with SAMPLED_IDS_PATH. */
const OUTPUT_PATH = process.env.SAMPLED_IDS_PATH ?? DEFAULT_OUTPUT_PATH;

/** JSON Lines file of annotations to read; override with SEED_ANNOTATIONS_PATH. */
const ANNOTATIONS_PATH =
  process.env.SEED_ANNOTATIONS_PATH ?? DEFAULT_ANNOTATIONS_PATH;
|
|
|
|
/** Votes collected for one paragraph, in the order the annotations were read. */
interface ParagraphVotes {
  categories: string[];
  specificities: number[];
}

/** Report whether a parsed record carries a string `paragraphId` and a `label` object. */
function hasParagraphIdAndLabel(value: unknown): boolean {
  if (typeof value !== "object" || value === null) {
    return false;
  }
  const { paragraphId, label } = value as Record<string, unknown>;
  return (
    typeof paragraphId === "string" && typeof label === "object" && label !== null
  );
}

/**
 * Group the category and specificity votes of all annotations by paragraph ID.
 *
 * @param annotations - Records read from the annotations file.
 * @returns A map from paragraph ID to the votes cast for that paragraph.
 * @throws Error when a record lacks a string `paragraphId` or a `label`
 *   object; the message gives the 1-based position of the record.
 */
function groupVotesByParagraph(
  annotations: readonly AnnotationRow[],
): Map<string, ParagraphVotes> {
  const votesByParagraph = new Map<string, ParagraphVotes>();
  let recordNumber = 0;
  for (const row of annotations) {
    recordNumber += 1;
    // Rows come from an unchecked JSON cast, so verify the fields read below
    // instead of failing later with an opaque TypeError.
    if (!hasParagraphIdAndLabel(row)) {
      throw new Error(
        `Annotation record ${recordNumber} is missing "paragraphId" or "label"`,
      );
    }
    let votes = votesByParagraph.get(row.paragraphId);
    if (!votes) {
      votes = { categories: [], specificities: [] };
      votesByParagraph.set(row.paragraphId, votes);
    }
    votes.categories.push(row.label.content_category);
    votes.specificities.push(row.label.specificity_level);
  }
  return votesByParagraph;
}

/**
 * Select a stratified sample of paragraph IDs and write it to OUTPUT_PATH.
 *
 * Loads every paragraph from the database, reads the annotations file at
 * ANNOTATIONS_PATH, attaches each paragraph's votes, runs `stratifiedSample`
 * with `defaultSamplingConfig()`, and writes the result as pretty-printed
 * JSON. Exits the process with status 0 on success; failures reject the
 * returned promise.
 */
async function main(): Promise<void> {
  // 1. Load all paragraphs from DB
  console.log("Loading paragraphs from DB...");
  const dbParagraphs = await db.select().from(schema.paragraphs);
  console.log(` ${dbParagraphs.length} paragraphs loaded`);

  // 2. Load raw annotations for split-vote detection
  console.log("Loading annotations for vote analysis...");
  const annotations = await readJsonl<AnnotationRow>(ANNOTATIONS_PATH);
  console.log(` ${annotations.length} annotations loaded`);

  const votesByParagraph = groupVotesByParagraph(annotations);

  // 3. Build ParagraphWithVotes array; paragraphs without annotations get
  // empty vote lists.
  const matchedIds = new Set<string>();
  const paragraphsWithVotes: ParagraphWithVotes[] = dbParagraphs.map((p) => {
    const votes = votesByParagraph.get(p.id);
    if (votes) {
      matchedIds.add(p.id);
    }
    return {
      id: p.id,
      stage1Category: p.stage1Category,
      stage1Specificity: p.stage1Specificity,
      categoryVotes: votes?.categories ?? [],
      specificityVotes: votes?.specificities ?? [],
    };
  });

  // Votes for IDs that are not in the DB never reach the sampler; report
  // them rather than dropping them silently.
  const unmatched = votesByParagraph.size - matchedIds.size;
  if (unmatched > 0) {
    console.warn(
      ` ${unmatched} annotated paragraph IDs not found in DB (their votes are ignored)`,
    );
  }

  // 4. Run stratified sampling
  console.log("Running stratified sampling...");
  const config = defaultSamplingConfig();
  const selectedIds = stratifiedSample(paragraphsWithVotes, config);

  // 5. Write output
  await writeFile(OUTPUT_PATH, JSON.stringify(selectedIds, null, 2));
  console.log(`\nWrote ${selectedIds.length} sampled IDs to ${OUTPUT_PATH}`);

  // Exit explicitly once the output is written.
  // NOTE(review): presumably needed because the DB client keeps the event
  // loop alive — confirm.
  process.exit(0);
}
|
|
|
|
/** Log why the run failed and terminate the process with exit status 1. */
function reportFailure(reason: unknown): never {
  console.error("Sampling failed:", reason);
  process.exit(1);
}

// Script entry point: start the sampling run and route any rejection to the
// failure handler.
main().catch(reportFailure);
|