SEC-cyBERT/labelapp/lib/assignment.ts
2026-03-29 00:32:24 -04:00

126 lines
3.6 KiB
TypeScript

/**
 * Generate all C(n, k) combinations of `k` elements taken from `arr`.
 *
 * Combinations are produced in lexicographic order of element positions, and
 * the elements inside each combination keep their relative order from `arr`.
 * Elements are selected by position, so duplicate values in `arr` yield
 * duplicate-looking combinations.
 *
 * @param arr - Pool of elements to choose from; it is not modified.
 * @param k - Number of elements per combination.
 * @returns Every k-element combination; `[[]]` when `k` is 0 and `[]` when
 *   `k` exceeds `arr.length`.
 * @throws RangeError if `k` is negative or not an integer. Without this check
 *   such values never reach a base case and the recursion is unbounded.
 */
function combinations<T>(arr: T[], k: number): T[][] {
  if (!Number.isInteger(k) || k < 0) {
    throw new RangeError(`k must be a non-negative integer, got ${k}`);
  }

  const results: T[][] = [];
  const current: T[] = [];

  // Backtracking over start positions; avoids copying `arr` at every level.
  const extend = (start: number): void => {
    if (current.length === k) {
      // Covers k === 0 as well: the empty combination is emitted exactly once.
      results.push([...current]);
      return;
    }
    // Stop early enough that the remaining slots can still be filled. When
    // k > arr.length this bound is negative and nothing is produced.
    const stillNeeded = k - current.length;
    for (let i = start; i <= arr.length - stillNeeded; i++) {
      current.push(arr[i]);
      extend(i + 1);
      current.pop();
    }
  };

  extend(0);
  return results;
}
/**
 * Randomly permute `arr` in place (Fisher-Yates) and return the same array.
 *
 * Walks from the back of the array towards the front; at each step the last
 * element of the not-yet-fixed prefix is swapped with a randomly chosen
 * element of that prefix (possibly itself). Randomness comes from
 * `Math.random`.
 */
function shuffle<T>(arr: T[]): T[] {
  // Size of the prefix that has not been fixed yet.
  let remaining = arr.length;
  while (remaining > 1) {
    const pick = Math.floor(Math.random() * remaining);
    remaining -= 1;
    const held = arr[remaining];
    arr[remaining] = arr[pick];
    arr[pick] = held;
  }
  return arr;
}
/**
* One assignment row: a single paragraph paired with a single annotator.
* A paragraph assigned to several annotators is represented by several rows.
*/
export interface Assignment {
/** Identifier of the paragraph being assigned. */
paragraphId: string;
/** Identifier of the annotator the paragraph is assigned to. */
annotatorId: string;
}
/**
 * Assign every paragraph to exactly `perParagraph` annotators, spreading the
 * paragraphs as evenly as possible over all C(n, perParagraph) annotator
 * groups, where n = `annotatorIds.length`.
 *
 * A shuffled copy of `paragraphIds` is cut into consecutive runs, one run per
 * annotator group, in the order `combinations` returns the groups. Every
 * paragraph in a run is assigned to each annotator of that group.
 *
 * Example with 6 annotators, perParagraph = 3 and 1200 paragraphs:
 *   - C(6,3) = 20 unique triples
 *   - each triple gets 1200 / 20 = 60 paragraphs
 *   - each annotator appears in C(5,2) = 10 triples -> 600 paragraphs each
 *
 * When the paragraph count is not a multiple of the number of groups, the
 * leftover paragraphs go one each to the first groups in enumeration order,
 * so per-annotator totals can differ slightly.
 *
 * Ids are not de-duplicated: the caller is expected to pass unique paragraph
 * and annotator ids.
 *
 * @param paragraphIds - Paragraphs to distribute; the array is not mutated.
 * @param annotatorIds - Pool of annotators to draw groups from.
 * @param perParagraph - Number of annotators each paragraph receives.
 * @returns One row per (paragraph, annotator) pairing, i.e.
 *   `paragraphIds.length * perParagraph` rows, grouped by annotator group.
 * @throws RangeError if `perParagraph` is negative or not an integer, or if
 *   there are paragraphs to assign but fewer than `perParagraph` annotators.
 *   Without this check every paragraph would be silently dropped.
 */
export function generateAssignments(
  paragraphIds: string[],
  annotatorIds: string[],
  perParagraph: number,
): Assignment[] {
  if (!Number.isInteger(perParagraph) || perParagraph < 0) {
    throw new RangeError(
      `perParagraph must be a non-negative integer, got ${perParagraph}`,
    );
  }

  const groups = combinations(annotatorIds, perParagraph);
  if (groups.length === 0) {
    // No group can be formed (perParagraph > annotatorIds.length). Dividing by
    // groups.length below would yield Infinity/NaN and an empty result.
    if (paragraphIds.length === 0) return [];
    throw new RangeError(
      `Cannot pick ${perParagraph} annotators per paragraph from ${annotatorIds.length} annotator(s)`,
    );
  }

  const shuffled = shuffle([...paragraphIds]);
  const perGroup = Math.floor(shuffled.length / groups.length);
  const remainder = shuffled.length % groups.length;

  const assignments: Assignment[] = [];
  let offset = 0;
  for (const [g, group] of groups.entries()) {
    // The first `remainder` groups absorb one leftover paragraph each.
    const count = perGroup + (g < remainder ? 1 : 0);
    for (const paragraphId of shuffled.slice(offset, offset + count)) {
      for (const annotatorId of group) {
        assignments.push({ paragraphId, annotatorId });
      }
    }
    offset += count;
  }
  return assignments;
}
/**
 * Log a summary of an assignment set to the console.
 *
 * Three sections are printed, in this order:
 *   1. the number of assignment rows for each id in `annotatorIds`
 *      (0 for ids that never occur in `assignments`);
 *   2. for every annotator pair that shares at least one paragraph, how many
 *      paragraphs they share, ordered by the pair's "a|b" key;
 *   3. the number of distinct paragraphs and the total number of rows.
 *
 * @param assignments - Rows to summarize.
 * @param annotatorIds - Annotators to list in the per-annotator section.
 */
export function printAssignmentStats(
  assignments: Assignment[],
  annotatorIds: string[],
): void {
  // One pass over the rows: tally rows per annotator and collect the set of
  // annotators seen for each paragraph.
  const rowsByAnnotator = new Map<string, number>();
  const annotatorsByParagraph = new Map<string, Set<string>>();
  assignments.forEach(({ paragraphId, annotatorId }) => {
    rowsByAnnotator.set(
      annotatorId,
      (rowsByAnnotator.get(annotatorId) ?? 0) + 1,
    );

    let members = annotatorsByParagraph.get(paragraphId);
    if (members === undefined) {
      members = new Set<string>();
      annotatorsByParagraph.set(paragraphId, members);
    }
    members.add(annotatorId);
  });

  console.log("\nPer-annotator assignment counts:");
  annotatorIds.forEach((id) => {
    console.log(` ${id}: ${rowsByAnnotator.get(id) ?? 0}`);
  });

  // Count shared paragraphs per unordered annotator pair. The two ids are
  // sorted before joining so (a, b) and (b, a) map to the same key.
  const sharedByPair = new Map<string, number>();
  for (const members of annotatorsByParagraph.values()) {
    const ids = Array.from(members);
    ids.forEach((first, idx) => {
      for (const second of ids.slice(idx + 1)) {
        const key = [first, second].sort().join("|");
        sharedByPair.set(key, (sharedByPair.get(key) ?? 0) + 1);
      }
    });
  }

  console.log("\nPairwise overlap (paragraphs shared):");
  const orderedKeys = [...sharedByPair.keys()].sort((x, y) =>
    x.localeCompare(y),
  );
  for (const key of orderedKeys) {
    const [left, right] = key.split("|");
    console.log(` ${left} & ${right}: ${sharedByPair.get(key)}`);
  }

  console.log(`\nTotal unique paragraphs: ${annotatorsByParagraph.size}`);
  console.log(`Total assignment rows: ${assignments.length}`);
}