// SEC-cyBERT/labelapp/lib/sampling.ts

/**
 * One paragraph as seen by the sampler: its id, its stage-1 labels, and the
 * raw per-annotation votes.
 */
export interface ParagraphWithVotes {
  /** Identifier; selection is tracked by this id, so it is expected to be unique per paragraph. */
  id: string;
  /**
   * Stage-1 category label (presumably the consensus of `categoryVotes` —
   * confirm against the code that builds these records). Paragraphs whose
   * category is null are skipped by the rare-category guarantee and by the
   * proportional fill.
   */
  stage1Category: string | null;
  /**
   * Stage-1 specificity label (presumably the consensus of `specificityVotes` —
   * confirm against the code that builds these records). A null value is
   * bucketed as `0` when category x specificity cells are formed.
   */
  stage1Specificity: number | null;
  /** Raw category votes from stage1 annotations; the category split-vote checks read these. */
  categoryVotes: string[];
  /** Raw specificity votes from stage1 annotations; the specificity split-vote check reads these. */
  specificityVotes: number[];
}

/**
 * One named stratum: a predicate selecting eligible paragraphs plus the number
 * of paragraphs to draw from them.
 */
export interface StratumConfig {
  /** Human-readable label; only used in the per-stratum log line. */
  name: string;
  /** Maximum number of paragraphs to draw; fewer are drawn when fewer are eligible. */
  count: number;
  /** Returns true for paragraphs that belong to this stratum. */
  filter: (p: ParagraphWithVotes) => boolean;
}

/**
 * Full sampling plan consumed by `stratifiedSample`.
 */
export interface SamplingConfig {
  /**
   * Overall sample-size target. Only the final proportional fill is sized from
   * it (target minus what is already selected); the strata and the
   * rare-category guarantee are not capped by it.
   */
  total: number;
  /**
   * Strata processed in array order; a paragraph picked by an earlier stratum
   * is not eligible for later ones.
   */
  strata: StratumConfig[];
}

/**
 * Randomly permute `arr` in place (Fisher-Yates) and hand the same array back.
 *
 * Walks from the last slot down to index 1, swapping each slot with a randomly
 * chosen slot at or before it. Randomness comes from `Math.random`, so the
 * result cannot be seeded or reproduced.
 *
 * @param arr Array to permute; it is mutated.
 * @returns The same `arr` instance, for call chaining.
 */
function shuffle<T>(arr: T[]): T[] {
  let end = arr.length - 1;
  while (end > 0) {
    // Pick a partner in [0, end]; choosing `end` itself leaves the slot as is.
    const pick = Math.floor(Math.random() * (end + 1));
    const held = arr[end];
    arr[end] = arr[pick];
    arr[pick] = held;
    end -= 1;
  }
  return arr;
}

/**
 * Tell whether the raw category votes of a paragraph are split between two
 * given categories, i.e. `catA` and `catB` each received at least one vote.
 *
 * Comparison is exact (case-sensitive) string equality. Votes for any other
 * category are ignored, and the order of `catA`/`catB` does not matter.
 */
function hasCategorySplit(
  p: ParagraphWithVotes,
  catA: string,
  catB: string,
): boolean {
  const votes = p.categoryVotes;
  if (!votes.includes(catA)) {
    return false;
  }
  return votes.includes(catB);
}

/**
 * Tell whether the raw specificity votes of a paragraph include both of two
 * given values, i.e. `specA` and `specB` each received at least one vote.
 *
 * Only exact matches count: votes of 2 and 5 are not treated as a split
 * between 3 and 4.
 */
function hasSpecificitySplit(
  p: ParagraphWithVotes,
  specA: number,
  specB: number,
): boolean {
  const votes = p.specificityVotes;
  return [specA, specB].every((level) => votes.includes(level));
}

/**
 * Draw roughly `count` paragraph ids from `eligible`, spreading the draws over
 * category x specificity cells in proportion to each cell's size.
 *
 * Quotas follow the largest-remainder method: every cell first receives the
 * floor of its exact share, then the slots still unassigned go, one apiece, to
 * the cells with the largest fractional parts. A cell never contributes more
 * members than it holds, so fewer than `count` ids can come back.
 *
 * A null category is bucketed as "unknown" and a null specificity as 0.
 *
 * @param eligible Paragraphs to draw from; the array itself is not reordered.
 * @param count Number of ids wanted.
 * @returns Selected ids, grouped cell by cell in descending-remainder order.
 */
function proportionalSample(
  eligible: ParagraphWithVotes[],
  count: number,
): string[] {
  // Bucket the paragraphs by their category x specificity cell.
  const buckets = new Map<string, ParagraphWithVotes[]>();
  for (const para of eligible) {
    const cellKey = `${para.stage1Category ?? "unknown"}|${para.stage1Specificity ?? 0}`;
    let bucket = buckets.get(cellKey);
    if (bucket === undefined) {
      bucket = [];
      buckets.set(cellKey, bucket);
    }
    bucket.push(para);
  }

  // Whole-number part of every cell's proportional share, plus what was cut off.
  const population = eligible.length;
  const quotas = Array.from(buckets.values(), (members) => {
    const share = (members.length / population) * count;
    const whole = Math.floor(share);
    return { members, quota: whole, fraction: share - whole };
  });

  // Hand the slots lost to rounding to the cells that were cut the most.
  let unassigned = count - quotas.reduce((sum, q) => sum + q.quota, 0);
  quotas.sort((a, b) => b.fraction - a.fraction);
  for (const q of quotas) {
    if (unassigned <= 0) break;
    q.quota += 1;
    unassigned -= 1;
  }

  // Random draw inside each cell, capped at the cell's size.
  const picked: string[] = [];
  for (const { members, quota } of quotas) {
    shuffle(members);
    const take = Math.min(quota, members.length);
    for (let k = 0; k < take; k += 1) {
      picked.push(members[k].id);
    }
  }

  return picked;
}

/**
 * Create the stock sampling plan: a 1,200-paragraph target with four
 * split-vote strata (three category splits and one specificity split).
 *
 * A fresh config object is built on every call.
 */
export function defaultSamplingConfig(): SamplingConfig {
  const config: SamplingConfig = { total: 1200, strata: [] };

  // Register a stratum of paragraphs with at least one vote for each of two categories.
  const addCategorySplit = (
    name: string,
    count: number,
    catA: string,
    catB: string,
  ): void => {
    config.strata.push({
      name,
      count,
      filter: (p) => hasCategorySplit(p, catA, catB),
    });
  };

  addCategorySplit(
    "Mgmt↔RMP split votes",
    120,
    "Management Role",
    "Risk Management Process",
  );
  addCategorySplit(
    "None/Other↔Strategy splits",
    80,
    "None/Other",
    "Strategy Integration",
  );
  config.strata.push({
    name: "Spec [3,4] splits",
    count: 80,
    filter: (p) => hasSpecificitySplit(p, 3, 4),
  });
  addCategorySplit(
    "Board↔Mgmt splits",
    80,
    "Board Governance",
    "Management Role",
  );

  return config;
}

/**
 * Shuffle `pool` in place and add the ids of its first `limit` entries (or of
 * all entries when fewer are available) to `selected`.
 *
 * @returns `Math.min(limit, pool.length)`; a non-positive value means nothing was added.
 */
function takeRandom(
  pool: ParagraphWithVotes[],
  limit: number,
  selected: Set<string>,
): number {
  shuffle(pool);
  const take = Math.min(limit, pool.length);
  for (let i = 0; i < take; i++) {
    selected.add(pool[i].id);
  }
  return take;
}

/**
 * Run stratified sampling. Returns selected paragraph IDs in selection order.
 *
 * Process:
 * 1. Named strata: for each stratum in order, filter the not-yet-selected
 *    paragraphs and randomly select up to `count` of them.
 * 2. Rare category guarantee (best effort, budget of 120 picks): categories
 *    with fewer than 15 selected paragraphs are topped up towards 15, rarest
 *    first; any budget left over goes to extra "Incident Disclosure" paragraphs.
 * 3. Final fill: if fewer than `config.total` ids are selected, the shortfall
 *    is drawn proportionally from category x specificity cells.
 *
 * Paragraphs without a category take part in phase 1 only. Phases 1 and 2 are
 * not capped by `config.total`, so the result can exceed it when the strata
 * counts are large; it can also fall short when too few paragraphs are eligible.
 *
 * Progress is reported through `console.log`.
 *
 * @param paragraphs Candidate paragraphs; the array is not reordered.
 * @param config Strata definitions and the overall target size.
 * @returns Ids of the selected paragraphs, without duplicates.
 */
export function stratifiedSample(
  paragraphs: ParagraphWithVotes[],
  config: SamplingConfig,
): string[] {
  const selected = new Set<string>();

  // Phase 1: Named strata (split-vote strata)
  for (const stratum of config.strata) {
    const eligible = paragraphs.filter(
      (p) => !selected.has(p.id) && stratum.filter(p),
    );
    const take = takeRandom(eligible, stratum.count, selected);
    console.log(
      `  Stratum "${stratum.name}": wanted ${stratum.count}, eligible ${eligible.length}, selected ${take}`,
    );
  }

  // Phase 2: Rare category guarantee (120 slots, >= 15 per category)
  const RARE_GUARANTEE_TOTAL = 120;
  const MIN_PER_CATEGORY = 15;
  const rareStartSize = selected.size;

  // One pass over the paragraphs gathers everything phase 2 needs: the set of
  // categories, how many selected paragraphs each category already has, and
  // the unselected paragraphs still available per category.
  const allCategories = new Set<string>();
  const selectedByCat = new Map<string, number>();
  const unselectedByCat = new Map<string, ParagraphWithVotes[]>();
  const countedIds = new Set<string>();
  for (const p of paragraphs) {
    const isSelected = selected.has(p.id);
    // A selected id is counted once, via the first paragraph that carries it.
    const firstWithId = isSelected && !countedIds.has(p.id);
    if (firstWithId) countedIds.add(p.id);

    const cat = p.stage1Category;
    if (!cat) continue;
    allCategories.add(cat);

    if (firstWithId) {
      selectedByCat.set(cat, (selectedByCat.get(cat) ?? 0) + 1);
    } else if (!isSelected) {
      const bucket = unselectedByCat.get(cat);
      if (bucket) {
        bucket.push(p);
      } else {
        unselectedByCat.set(cat, [p]);
      }
    }
  }

  // Sort categories by current count ascending so rarest get filled first
  const sortedCats = [...allCategories].sort(
    (a, b) => (selectedByCat.get(a) ?? 0) - (selectedByCat.get(b) ?? 0),
  );

  // Top up categories that have fewer than MIN_PER_CATEGORY
  let rareAdded = 0;
  for (const cat of sortedCats) {
    if (rareAdded >= RARE_GUARANTEE_TOTAL) break;
    const need = MIN_PER_CATEGORY - (selectedByCat.get(cat) ?? 0);
    if (need <= 0) continue;
    // filter() also copies the bucket, so shuffling leaves the bucket's order intact.
    const eligible = (unselectedByCat.get(cat) ?? []).filter(
      (p) => !selected.has(p.id),
    );
    rareAdded += takeRandom(
      eligible,
      Math.min(need, RARE_GUARANTEE_TOTAL - rareAdded),
      selected,
    );
  }

  // Give extra slots to "Incident Disclosure" if budget remains
  if (rareAdded < RARE_GUARANTEE_TOTAL) {
    const incidentEligible = (
      unselectedByCat.get("Incident Disclosure") ?? []
    ).filter((p) => !selected.has(p.id));
    takeRandom(incidentEligible, RARE_GUARANTEE_TOTAL - rareAdded, selected);
  }

  console.log(
    `  Rare category guarantee: added ${selected.size - rareStartSize} (budget ${RARE_GUARANTEE_TOTAL})`,
  );

  // Phase 3: Proportional stratified random fill
  const remaining = config.total - selected.size;
  if (remaining > 0) {
    const eligible = paragraphs.filter(
      (p) => !selected.has(p.id) && p.stage1Category != null,
    );
    const filled = proportionalSample(eligible, remaining);
    for (const id of filled) {
      selected.add(id);
    }
    console.log(
      `  Proportional fill: added ${filled.length} (target ${remaining})`,
    );
  }

  console.log(`  Total selected: ${selected.size}`);
  return [...selected];
}