// SEC-cyBERT/labelapp/scripts/sample.ts

// Supply a localhost PostgreSQL connection string when the environment does
// not already define DATABASE_URL; an existing value is left untouched.
// NOTE(review): under native ES modules, import declarations are evaluated
// before this statement runs. If "../db" reads DATABASE_URL at load time, this
// default only takes effect when the file is executed through a CommonJS-style
// transpile that preserves statement order — confirm against the runner used.
if (process.env.DATABASE_URL == null) {
  process.env.DATABASE_URL =
    "postgresql://sec_cybert:sec_cybert@localhost:5432/sec_cybert";
}

import { readFile, writeFile } from "node:fs/promises";
import { db } from "../db";
import * as schema from "../db/schema";
import {
  type ParagraphWithVotes,
  defaultSamplingConfig,
  stratifiedSample,
} from "../lib/sampling";

/**
 * Reads a JSON Lines file and parses every non-blank line.
 *
 * Each line is trimmed before parsing, so a leading UTF-8 byte-order mark,
 * CRLF line endings, and surrounding whitespace do not cause parse failures.
 * Lines that are empty after trimming are skipped.
 *
 * The parsed values are cast to `T` without runtime validation; callers are
 * responsible for making sure the file actually contains that shape.
 *
 * @param path - Path of the JSONL file to read (decoded as UTF-8).
 * @returns The parsed values, in file order.
 * @throws SyntaxError when a line is not valid JSON; the message includes the
 *   file path and the 1-based line number of the offending line.
 */
async function readJsonl<T = unknown>(path: string): Promise<T[]> {
  const text = await readFile(path, "utf-8");
  const rows: T[] = [];

  for (const [index, rawLine] of text.split("\n").entries()) {
    // trim() strips U+FEFF (BOM) and "\r" in addition to ordinary whitespace;
    // JSON.parse itself would reject a BOM.
    const line = rawLine.trim();
    if (!line) continue;

    try {
      rows.push(JSON.parse(line) as T);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      // Re-throw as SyntaxError (the type JSON.parse raises) with location info.
      throw new SyntaxError(`${path}:${index + 1}: invalid JSON (${reason})`);
    }
  }

  return rows;
}

/**
 * The label fields this script reads from an annotation record.
 * NOTE(review): the records on disk may carry additional fields; only the two
 * used for vote grouping are declared here.
 */
interface AnnotationLabel {
  /** Category assigned by this annotation. */
  content_category: string;
  /** Specificity level assigned by this annotation. */
  specificity_level: number;
}

/** One record of the annotations JSONL file, as consumed by this script. */
interface AnnotationRow {
  /** ID of the paragraph this annotation applies to. */
  paragraphId: string;
  /** The label values contributed by this annotation. */
  label: AnnotationLabel;
}

// Input: JSONL file of annotations used to collect per-paragraph votes.
// Override with the SEED_ANNOTATIONS_PATH environment variable.
const ANNOTATIONS_PATH =
  process.env["SEED_ANNOTATIONS_PATH"] ??
  "/home/joey/Documents/sec-cyBERT/data/annotations/stage1.jsonl";

// Output: JSON file that receives the sampled paragraph IDs.
// Override with the SAMPLED_IDS_PATH environment variable.
const OUTPUT_PATH =
  process.env["SAMPLED_IDS_PATH"] ??
  "/home/joey/Documents/sec-cyBERT/labelapp/.sampled-ids.json";

/**
 * Script body: loads every paragraph from the database, attaches the category
 * and specificity votes found in the annotations file, runs stratified
 * sampling with the default configuration, and writes the result of
 * `stratifiedSample` to OUTPUT_PATH as pretty-printed JSON.
 *
 * Paragraphs without any annotation get empty vote arrays. Annotations whose
 * paragraph ID is not present in the database are grouped but never used.
 *
 * Ends by calling `process.exit(0)`, so it does not return normally.
 */
async function main() {
  // Step 1 — fetch all paragraph rows.
  console.log("Loading paragraphs from DB...");
  const paragraphRows = await db.select().from(schema.paragraphs);
  console.log(`  ${paragraphRows.length} paragraphs loaded`);

  // Step 2 — read the raw annotations that provide the individual votes.
  console.log("Loading annotations for vote analysis...");
  const annotationRows = await readJsonl<AnnotationRow>(ANNOTATIONS_PATH);
  console.log(`  ${annotationRows.length} annotations loaded`);

  // Collect votes per paragraph ID, preserving the order of the file.
  type VoteBucket = { categories: string[]; specificities: number[] };
  const buckets = new Map<string, VoteBucket>();
  annotationRows.forEach(({ paragraphId, label }) => {
    const existing = buckets.get(paragraphId);
    const bucket: VoteBucket =
      existing ?? { categories: [], specificities: [] };
    if (existing === undefined) {
      buckets.set(paragraphId, bucket);
    }
    bucket.categories.push(label.content_category);
    bucket.specificities.push(label.specificity_level);
  });

  // Step 3 — pair each DB paragraph with its votes (empty when it has none).
  const paragraphsWithVotes = paragraphRows.map((row): ParagraphWithVotes => {
    const bucket = buckets.get(row.id);
    const categoryVotes = bucket === undefined ? [] : bucket.categories;
    const specificityVotes = bucket === undefined ? [] : bucket.specificities;
    return {
      id: row.id,
      stage1Category: row.stage1Category,
      stage1Specificity: row.stage1Specificity,
      categoryVotes,
      specificityVotes,
    };
  });

  // Step 4 — sample using the project's default configuration.
  console.log("Running stratified sampling...");
  const selectedIds = stratifiedSample(
    paragraphsWithVotes,
    defaultSamplingConfig(),
  );

  // Step 5 — persist the sample with 2-space indentation.
  const serialized = JSON.stringify(selectedIds, null, 2);
  await writeFile(OUTPUT_PATH, serialized);
  console.log(`\nWrote ${selectedIds.length} sampled IDs to ${OUTPUT_PATH}`);

  // NOTE(review): explicit exit — presumably because an open DB connection
  // would otherwise keep the process alive; confirm against "../db".
  process.exit(0);
}

// Entry point: run the script; on any rejection, log it and exit with status 1.
main().then(undefined, (reason: unknown) => {
  console.error("Sampling failed:", reason);
  process.exit(1);
});