SEC-cyBERT/labelapp/lib/assignment.ts

/**
 * Generate all C(n, k) combinations of `k` elements taken from `arr`.
 *
 * Combinations are produced in index order (every combination that starts
 * with `arr[0]` comes first, then those starting with `arr[1]`, ...) and the
 * elements inside each combination keep their relative order from `arr`.
 *
 * @param arr - Source elements; the array is not modified.
 * @param k - Number of elements in each combination.
 * @returns Every k-element combination. `[[]]` when `k` is 0; `[]` when `k`
 *   is negative, not an integer, or larger than `arr.length`, since no
 *   combination of that size exists.
 */
function combinations<T>(arr: readonly T[], k: number): T[][] {
  // Reject sizes that can never reach the `k === 0` base case. Without this
  // guard a negative or fractional `k` recurses until the stack overflows.
  if (!Number.isInteger(k) || k < 0 || k > arr.length) return [];
  if (k === 0) return [[]];

  const results: T[][] = [];
  // Past this index there are too few remaining elements to complete a combo.
  const lastStart = arr.length - k;
  for (const [i, head] of arr.entries()) {
    if (i > lastStart) break;
    // Fix `head` as the first element, then pick the other k-1 from what follows.
    for (const tail of combinations(arr.slice(i + 1), k - 1)) {
      results.push([head, ...tail]);
    }
  }
  return results;
}

/**
 * Randomly permute `arr` in place with the Fisher-Yates algorithm.
 *
 * Walks from the last index down to 1, swapping each slot with a randomly
 * chosen slot at or before it. Returns the same array instance it was given.
 */
function shuffle<T>(arr: T[]): T[] {
  let i = arr.length - 1;
  while (i > 0) {
    // Pick a partner index uniformly from 0..i (inclusive).
    const j = Math.floor(Math.random() * (i + 1));
    const held = arr[i];
    arr[i] = arr[j];
    arr[j] = held;
    i -= 1;
  }
  return arr;
}

/**
 * One assignment row: a single paragraph paired with a single annotator.
 * A paragraph assigned to several annotators is represented by several rows.
 */
export interface Assignment {
  /** Identifier of the paragraph being assigned. */
  paragraphId: string;
  /** Identifier of the annotator the paragraph is assigned to. */
  annotatorId: string;
}

/**
 * Generate balanced (BIBD-style) assignments: each paragraph gets exactly
 * `perParagraph` annotators, with paragraphs spread as evenly as possible
 * across all C(n, perParagraph) annotator groups.
 *
 * Example with 6 annotators, perParagraph=3 and 1200 paragraphs:
 * - C(6,3) = 20 unique groups (triples)
 * - Each group gets floor(1200/20) = 60 paragraphs
 * - Each annotator appears in C(5,2) = 10 groups -> 600 paragraphs each
 *
 * Paragraphs are shuffled (on a copy; `paragraphIds` is not modified) before
 * being dealt out, so which paragraph lands in which group is random. When
 * the paragraph count is not a multiple of the group count, the leftover
 * paragraphs go one each to the first groups in combination order, so
 * annotators listed early in `annotatorIds` may receive slightly more work.
 *
 * @param paragraphIds - Paragraphs to assign.
 * @param annotatorIds - Annotators available for assignment.
 * @param perParagraph - Number of annotators each paragraph receives.
 * @returns One row per (paragraph, annotator) pair, grouped by annotator
 *   group; empty when `paragraphIds` is empty.
 * @throws RangeError if `perParagraph` is negative or not an integer, or if
 *   there are paragraphs to assign but fewer than `perParagraph` annotators.
 */
export function generateAssignments(
  paragraphIds: string[],
  annotatorIds: string[],
  perParagraph: number,
): Assignment[] {
  if (!Number.isInteger(perParagraph) || perParagraph < 0) {
    throw new RangeError(
      `perParagraph must be a non-negative integer, got ${perParagraph}`,
    );
  }
  if (paragraphIds.length === 0) return [];
  if (perParagraph > annotatorIds.length) {
    // Without this check there are zero groups, the division below yields
    // NaN/Infinity, and every paragraph would be silently dropped.
    throw new RangeError(
      `perParagraph (${perParagraph}) exceeds the number of annotators (${annotatorIds.length})`,
    );
  }

  // After validation 0 <= perParagraph <= n, so there is at least one group.
  const groups = combinations(annotatorIds, perParagraph);
  const shuffled = shuffle([...paragraphIds]);
  const perGroup = Math.floor(shuffled.length / groups.length);
  const remainder = shuffled.length % groups.length;

  const assignments: Assignment[] = [];
  let offset = 0;

  for (const [g, group] of groups.entries()) {
    // Distribute remainder paragraphs to the first `remainder` groups
    const count = perGroup + (g < remainder ? 1 : 0);

    for (const paragraphId of shuffled.slice(offset, offset + count)) {
      for (const annotatorId of group) {
        assignments.push({ paragraphId, annotatorId });
      }
    }

    offset += count;
  }

  return assignments;
}

/**
 * Log a summary of an assignment set to the console.
 *
 * Prints, in order: the number of rows per annotator (for every id in
 * `annotatorIds`, 0 if it has none), the number of paragraphs shared by each
 * pair of annotators that co-occur on at least one paragraph, the number of
 * distinct paragraphs, and the total row count.
 */
export function printAssignmentStats(
  assignments: Assignment[],
  annotatorIds: string[],
): void {
  // One pass over the rows builds both tallies: rows per annotator, and the
  // set of annotators attached to each paragraph.
  const rowsByAnnotator = new Map<string, number>();
  const annotatorsByParagraph = new Map<string, Set<string>>();
  for (const { paragraphId, annotatorId } of assignments) {
    rowsByAnnotator.set(
      annotatorId,
      (rowsByAnnotator.get(annotatorId) ?? 0) + 1,
    );

    const seen = annotatorsByParagraph.get(paragraphId);
    if (seen === undefined) {
      annotatorsByParagraph.set(paragraphId, new Set([annotatorId]));
    } else {
      seen.add(annotatorId);
    }
  }

  console.log("\nPer-annotator assignment counts:");
  for (const id of annotatorIds) {
    console.log(`  ${id}: ${rowsByAnnotator.get(id) ?? 0}`);
  }

  // For every paragraph, count each unordered pair of its annotators once.
  // The pair is keyed as "<smaller>|<larger>" so both orders share a key.
  const sharedByPair = new Map<string, number>();
  for (const members of annotatorsByParagraph.values()) {
    const list = Array.from(members);
    list.forEach((first, idx) => {
      for (const second of list.slice(idx + 1)) {
        const key = [first, second].sort().join("|");
        sharedByPair.set(key, (sharedByPair.get(key) ?? 0) + 1);
      }
    });
  }

  console.log("\nPairwise overlap (paragraphs shared):");
  const ordered = Array.from(sharedByPair).sort(([x], [y]) =>
    x.localeCompare(y),
  );
  for (const [pair, shared] of ordered) {
    const [left, right] = pair.split("|");
    console.log(`  ${left} & ${right}: ${shared}`);
  }

  // Totals
  console.log(`\nTotal unique paragraphs: ${annotatorsByParagraph.size}`);
  console.log(`Total assignment rows: ${assignments.length}`);
}